Example code: a Python crawler that downloads documents from the web
I have recently been learning Python and, naturally, got into web crawling, so I wrote a small crawler. Starting from an initial URL, it parses each page, uses a regular expression to collect the links still to be crawled, uses BeautifulSoup to extract the text, and saves that text with a small outputer I wrote myself. The full code follows.
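Note that the main script imports the other modules with `from baike_spider import ...`, which implies the four helper modules live in a package directory named `baike_spider`. A layout like the following (my assumption; the original post does not show it) makes the imports resolve when the script is run from the package's parent directory:

baike_spider/
    __init__.py
    url_manager.py
    html_downloader.py
    html_parser.py
    html_outputer.py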
Spider_main.py
# coding:utf8
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        # Wire up the four collaborators: URL manager, downloader, parser, outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            print("self.urls.has %s" % self.urls.new_urls)
            try:
                new_url = self.urls.get_new_url()
                print("craw %d: %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                # The parser returns the outgoing links and the extracted text.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # hard cap on the number of pages crawled
                    break
                count = count + 1
            except:
                print("craw failed")
        self.outputer.output_html()
        self.outputer.output_txt()


if __name__ == '__main__':
    root_url = "http://www.shushu8.com/jiangnan/longzu2qianzhuan/1"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
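One caveat in craw(): the bare except swallows the reason a page failed, and the 1000-page cap is hard-coded. A minimal variant of the method (my sketch, not the original code) reports the exception and takes the cap as a parameter; it is a drop-in replacement inside SpiderMain:

    def craw(self, root_url, max_pages=1000):
        # Sketch: same loop as craw() above, with two small assumed improvements.
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw %d: %s" % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == max_pages:  # cap is now a parameter
                    break
                count += 1
            except Exception as e:  # report why a page failed instead of hiding it
                print("craw failed: %s" % e)
        self.outputer.output_html()
        self.outputer.output_txt()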
url_manager.py
class UrlManager(object):
    def __init__(self):
        # new_urls: URLs waiting to be crawled; old_urls: URLs already crawled.
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        print(url)
        if url is None:
            return
        # Only queue URLs that have never been seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        # print('new url is %s' % new_url)
        return new_url

    def add_new_urls(self, urls):
        print("add_new_urls %s" % urls)
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
            print(url)
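A quick standalone check of the de-duplication behaviour (hypothetical URLs, for illustration only):

manager = UrlManager()
manager.add_new_url("http://example.com/1")
manager.add_new_urls({"http://example.com/1", "http://example.com/2"})  # /1 is skipped as a duplicate
while manager.has_new_url():
    print(manager.get_new_url())  # each URL comes out once, then moves to old_urls
manager.add_new_url("http://example.com/1")  # already in old_urls, so not re-queued
print(manager.has_new_url())  # False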
html_parser.py
import re
import urllib.parse

from bs4 import BeautifulSoup


class HtmlParser(object):
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        print("parse new_urls %s" % new_urls)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_data(self, page_url, soup):
        # Extract the chapter title and body text from the page.
        res_data = {}
        res_data['url'] = page_url
        print(page_url)
        title_node = soup.find(class_="title").find("h1")
        print(title_node.get_text())
        res_data['title'] = title_node.get_text()
        print("_get_new_data")
        summary_node = soup.find('pre')
        print(summary_node.get_text())
        res_data['summary'] = summary_node.get_text()
        return res_data

    def _get_new_urls(self, page_url, soup):
        # Collect in-site links whose href matches /jiangnan/ and absolutize them.
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/jiangnan/"))
        print(links)
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
            # print(new_full_url)
        return new_urls
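The parser leans on urllib.parse.urljoin to turn the site's root-relative hrefs into absolute URLs. A short illustration (the href value here is my guess; the resolution behaviour is what matters):

import urllib.parse

# A root-relative href is resolved against the page it was found on:
print(urllib.parse.urljoin(
    "http://www.shushu8.com/jiangnan/longzu2qianzhuan/1",
    "/jiangnan/longzu2qianzhuan/2"))
# -> http://www.shushu8.com/jiangnan/longzu2qianzhuan/2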
html_downloader.py
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
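The downloader above uses urllib's defaults. Some servers reject urllib's default User-Agent, and urlopen() can block for a long time without a timeout, so a hardened variant might look like this (a sketch of an assumed improvement, not the original code):

import urllib.error
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # Send a browser-like User-Agent; some sites reject urllib's default one.
        request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            response = urllib.request.urlopen(request, timeout=10)  # avoid hanging forever
        except urllib.error.URLError:
            return None
        if response.getcode() != 200:
            return None
        return response.read()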
html_outputer.py
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_txt(self):
        # Dump every collected page as "title\nsummary\n" plain text.
        fout = open('output.txt', 'w', encoding='utf-8')
        for data in self.datas:
            fout.write('%s\n' % data['title'])
            fout.write('%s\n' % data['summary'])
        fout.close()

    def output_html(self):
        # Render the collected pages as an HTML table, one row per page.
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
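One design caveat in output_html(): the scraped title and summary are written into the HTML verbatim, so a stray < or & in the text would corrupt the table. A safer variant of the method (my sketch, assuming the same self.datas structure) escapes the values and uses a with block so the file is always closed:

import html


def output_html(self):
    # Sketch of a safer drop-in replacement for HtmlOutputer.output_html():
    # html.escape() keeps raw '<' and '&' in scraped text from breaking the markup.
    with open('output.html', 'w', encoding='utf-8') as fout:
        fout.write('<html><body><table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % html.escape(data['url']))
            fout.write('<td>%s</td>' % html.escape(data['title']))
            fout.write('<td>%s</td>' % html.escape(data['summary']))
            fout.write('</tr>')
        fout.write('</table></body></html>')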
Summary
That covers the example code for a Python crawler that downloads documents from the web. I hope it is helpful; if you have any questions, leave me a message and I will reply promptly. Many thanks, as always, for your support of the 毛票票 site!