Example code: a Python crawler that downloads documents from the web
I have been learning Python recently and naturally ran into web crawling, so I wrote a small crawler. Starting from an initial URL, it downloads each page, collects the links to crawl next with a regular expression, extracts the text with BeautifulSoup, and saves the result through a small outputer I wrote myself. The full code follows:
spider_main.py
# coding: utf8
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        # Wire up the four collaborators: URL manager, downloader, parser, outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            print("self.urls.has %s" % self.urls.new_urls)
            try:
                new_url = self.urls.get_new_url()
                print("craw %d: %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # safety cap: stop after 1000 pages
                    break
                count = count + 1
            except Exception as e:
                print("craw failed: %s" % e)
        self.outputer.output_html()
        self.outputer.output_txt()


if __name__ == '__main__':
    root_url = "http://www.shushu8.com/jiangnan/longzu2qianzhuan/1"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
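Since spider_main.py imports its helpers with from baike_spider import ..., the five modules are assumed to live together in a package named baike_spider (the name is taken from the import line; adjust it to your own layout):

baike_spider/
    __init__.py
    spider_main.py
    url_manager.py
    html_downloader.py
    html_parser.py
    html_outputer.py

With that layout, run the crawler from the directory above the package, e.g. python -m baike_spider.spider_main.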
url_manager.py
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        print(url)
        if url is None:
            return
        # Only queue URLs we have never seen, whether pending or already crawled.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        # print('new url is %s' % new_url)
        return new_url

    def add_new_urls(self, urls):
        print("add_new_urls %s" % urls)
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
            print(url)
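As a quick sanity check, here is a small illustrative session (the example.com URLs are placeholders, not pages from the crawl target) showing how the two sets keep a page from being crawled twice:

manager = UrlManager()
manager.add_new_url("http://example.com/page1")
manager.add_new_url("http://example.com/page1")  # duplicate: silently ignored
manager.add_new_urls({"http://example.com/page2"})
print(manager.has_new_url())   # True
url = manager.get_new_url()    # moves one URL from new_urls to old_urls
manager.add_new_url(url)       # already in old_urls, so it is not re-queued
print(len(manager.new_urls))   # 1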
html_parser.py
import re
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        print("parse new_urls %s" % new_urls)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        print(page_url)
        # The chapter title sits in an <h1> inside the element with class "title".
        title_node = soup.find(class_="title").find("h1")
        print(title_node.get_text())
        res_data['title'] = title_node.get_text()
        print("_get_new_data")
        # The chapter text is kept in a <pre> block on this site.
        summary_node = soup.find('pre')
        print(summary_node.get_text())
        res_data['summary'] = summary_node.get_text()
        return res_data

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # Collect only links whose href contains /jiangnan/ (this novel's section).
        links = soup.find_all('a', href=re.compile(r"/jiangnan/"))
        print(links)
        for link in links:
            new_url = link['href']
            # Resolve relative hrefs against the current page URL.
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
            # print(new_full_url)
        return new_urls
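The href values matched by the regex are site-relative paths, so urljoin is what turns them into absolute URLs. A minimal illustration (the /2 chapter path is made up for the example):

import urllib.parse

page_url = "http://www.shushu8.com/jiangnan/longzu2qianzhuan/1"
print(urllib.parse.urljoin(page_url, "/jiangnan/longzu2qianzhuan/2"))
# -> http://www.shushu8.com/jiangnan/longzu2qianzhuan/2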
html_downloader.py
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
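Many sites refuse urllib's default User-Agent or leave a connection hanging, so a slightly hardened variant of the downloader may be useful. This is a sketch, not part of the original code; the browser-style header string and the 10-second timeout are my own assumptions:

import urllib.error
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Assumed values: a browser-like User-Agent and a 10 s timeout.
        request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            response = urllib.request.urlopen(request, timeout=10)
        except urllib.error.URLError:
            return None  # network failure or HTTP error status
        if response.getcode() != 200:
            return None
        return response.read()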
html_outputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_txt(self):
        fout = open('output.txt', 'w', encoding='utf-8')
        for data in self.datas:
            fout.write('%s\n' % data['title'])
            fout.write('%s\n' % data['summary'])
        fout.close()

    def output_html(self):
        # Write one table row per collected page: url, title, summary.
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
Summary
That is the example code for a Python crawler that downloads documents from the web. I hope it helps; if you have any questions, leave me a comment and I will reply as soon as I can.