A simple spider scraper implemented with Scrapy
This article walks through a simple spider scraper built on Scrapy, shared here for reference. The details are as follows:
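The spider imports PoetryAnalysisItem from poetry_analysis.items, a file the original article does not show. A minimal sketch of what it would need to contain, inferred purely from the fields the spider assigns below, might be:

# poetry_analysis/items.py -- a guess at the item definition, which the
# original article does not include; fields are inferred from parse_poem.
from scrapy.item import Item, Field

class PoetryAnalysisItem(Item):
    text = Field()    # full poem text joined from the <pre> tags
    url = Field()     # URL the poem was scraped from
    title = Field()   # left half of <head><title>
    author = Field()  # right half of <head><title>, minus "a poem by"
    date = Field()    # date string matched in <p class="small">

With that item in place, the spider itself: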
# Standard Python library imports

# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'

class PoetryParser(object):
    """
    Provides a common parsing method for poems formatted this one specific way.
    """
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains "title - a poem by author"
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split('-')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item

class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']
    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]
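Within a configured Scrapy project, the spider can then be run by its name, for example scrapy crawl example.com_poetry -o poems.json to export the collected items. Note that the code above targets an old Scrapy release: scrapy.contrib, SgmlLinkExtractor and HtmlXPathSelector have since been deprecated and removed. On a current Scrapy the same pieces would be written roughly as follows (a sketch of the equivalent calls, not a drop-in tested replacement):

# Modern-Scrapy equivalents of the deprecated pieces used above.
from scrapy.spiders import CrawlSpider, Rule          # was scrapy.contrib.spiders
from scrapy.linkextractors import LinkExtractor       # replaces SgmlLinkExtractor

# HtmlXPathSelector is gone; responses expose .xpath() directly, e.g. in parse_poem:
#     text = response.xpath('//pre/text()').extract()
#     title_text = response.xpath('//head/title/text()').extract()[0]
#     item['date'] = response.xpath("//p[@class='small']/text()").re(self.date_pattern)
# and each SgmlLinkExtractor(allow=[...]) in rules becomes LinkExtractor(allow=[...]).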
Hopefully what is described here is helpful for your Python programming.