Complete Scrapy crawler examples
This article introduces the Scrapy framework through two worked examples: douban, which crawls text from Douban (book, review, and mail pages), and douban_imgs, which downloads images from a photo album. The details follow. (The code is written for Python 2 and targets the Douban site as it was at the time, so treat it as a reference.)
Example 1: douban
Directory tree
douban
    douban
        spiders
            __init__.py
            bookspider.py
            douban_comment_spider.py
            doumailspider.py
        __init__.py
        items.py
        pipelines.py
        settings.py
    scrapy.cfg
spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
bookspider.py
# -*- coding: utf-8 -*-
''' by sudo rm -rf  http://imchenkun.com '''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        # request the first page
        yield scrapy.Request(response.url, callback=self.parse_next)

        # request the remaining pages
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = page.xpath('@href').extract()[0]
            yield scrapy.Request(link, callback=self.parse_next)

    def parse_next(self, response):
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
            book['content'] = item.xpath('td[2]/p/text()').extract()[0]
            book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
            yield book
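To try the spider, you can run the scrapy CLI from the project root, or script the call the same way run_spider.py in example 2 does. A minimal sketch (the output file name books.json is my own choice, not from the original post):

from scrapy import cmdline

# Crawl with the spider registered under name='douban-book' and export
# the yielded items to JSON; run from the directory containing scrapy.cfg.
cmdline.execute('scrapy crawl douban-book -o books.json'.split(' '))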
douban_comment_spider.py
# -*- coding: utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',      # your Douban email
        'form_password': '你的密码',   # your Douban password
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',               # the login button text the form expects
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # if a captcha appears, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # do not deduplicate this request

    def parse_next_page(self, response):
        print response.status
        try:
            next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])
            print "next page"
            print next_url
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url,
                                 dont_filter=True)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)
        except:
            print "Next page Error"
            return

    def parse_comment_url(self, response):
        print response.status
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print comment_title
            print comment_url
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        print response.status
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]

            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print "data_type: " + data_type
            if data_type == '0':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))
            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            # print comment
            yield comment
doumailspider.py
# -*- coding: utf-8 -*-
''' by sudo rm -rf  http://imchenkun.com '''
import scrapy
from faker import Factory
from douban.items import DoubanMailItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': '你的邮箱',      # your Douban email
        'form_password': '你的密码',   # your Douban password
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # if a captcha appears, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        print response.status
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print mail
            yield mail
__init__.py
(This file contains no code.)
items.py
# -*- coding: utf-8 -*-
import scrapy


class DoubanBookItem(scrapy.Item):
    name = scrapy.Field()            # book title
    price = scrapy.Field()           # price
    edition_year = scrapy.Field()    # edition year
    publisher = scrapy.Field()       # publisher
    ratings = scrapy.Field()         # rating
    author = scrapy.Field()          # author
    content = scrapy.Field()


class DoubanMailItem(scrapy.Item):
    sender_time = scrapy.Field()     # time sent
    sender_from = scrapy.Field()     # sender
    url = scrapy.Field()             # detail URL of the mail
    title = scrapy.Field()           # mail subject


class DoubanMovieCommentItem(scrapy.Item):
    useful_num = scrapy.Field()      # how many people found the review useful
    no_help_num = scrapy.Field()     # how many people found it unhelpful
    people = scrapy.Field()          # reviewer
    people_url = scrapy.Field()      # reviewer's page
    star = scrapy.Field()            # rating
    comment = scrapy.Field()         # review text
    title = scrapy.Field()           # review title
    comment_page_url = scrapy.Field()  # URL of the current page
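Item classes behave like dictionaries with a fixed key set, which is why the spiders assign book['name'] and the pipelines read item['content']. A quick sketch of that behaviour, assuming the douban package is importable:

from douban.items import DoubanBookItem

book = DoubanBookItem()
book['name'] = u'Le Petit Prince'   # declared fields are set dict-style
# book['foo'] = 1  # would raise KeyError: only declared Fields are allowed
print book.get('name')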
pipelines.py
# -*- coding: utf-8 -*-


class DoubanBookPipeline(object):
    def process_item(self, item, spider):
        # e.g. '[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'
        # (author / translator / publisher / edition year / price)
        info = item['content'].split('/')
        item['name'] = item['name']
        item['price'] = info[-1]
        item['edition_year'] = info[-2]
        item['publisher'] = info[-3]
        return item


class DoubanMailPipeline(object):
    def process_item(self, item, spider):
        # strip spaces and literal "\n" sequences from the subject
        item['title'] = item['title'].replace(' ', '').replace('\\n', '')
        return item


class DoubanMovieCommentPipeline(object):
    def process_item(self, item, spider):
        return item
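To make the negative indexing in DoubanBookPipeline concrete, here is what the split produces for the sample 'content' string in the comment above (whitespace around the slashes is kept, since the delimiter is just '/'):

info = u'[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'.split('/')
price = info[-1]          # ' 22.00元'  -> price, last field
edition_year = info[-2]   # ' 2003-8 '  -> edition year
publisher = info[-3]      # ' 人民文学出版社 '  -> publisher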
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'douban.pipelines.DoubanBookPipeline': 300,
    #'douban.pipelines.DoubanMailPipeline': 600,
    'douban.pipelines.DoubanMovieCommentPipeline': 900,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
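The integers in ITEM_PIPELINES are priorities in the 0-1000 range, and pipelines run in ascending order. Here only the pass-through DoubanMovieCommentPipeline is active; uncomment the book or mail entry when running those spiders so their items get post-processed.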
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban.settings

[deploy]
#url = http://localhost:6800/
project = douban
Example 2: douban_imgs
Directory tree
douban_imgs
    douban_imgs
        spiders
            __init__.py
            download_douban.py
        __init__.py
        items.py
        pipelines.py
        run_spider.py
        settings.py
    scrapy.cfg
spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
download_douban.py
# coding=utf-8
from scrapy.spiders import Spider
import re
from scrapy import Request
from douban_imgs.items import DoubanImgsItem


class download_douban(Spider):
    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        self.allowed_domains = ['douban.com']
        self.start_urls = [
            'http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url
        # call the base class's constructor
        # super(download_douban, self).__init__(*args, **kwargs)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
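Since __init__ takes a url keyword argument (the album id), you can point the spider at a different album from the command line with scrapy's -a option. A sketch, reusing the default id from __init__:

from scrapy import cmdline

# -a feeds keyword arguments to the spider's __init__; swap in any
# public album id you want to download.
cmdline.execute('scrapy crawl download_douban -a url=1638835355'.split(' '))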
__init__.py
(This file contains no code.)
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Item, Field


class DoubanImgsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = Field()
    images = Field()
    image_paths = Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
from scrapy import log


class DoubanImgsPipeline(object):
    def process_item(self, item, spider):
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
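For reference, the results argument that item_completed unpacks is a list of (success, info) two-tuples, one per image request, which is what the list comprehension above iterates over. Roughly (all values below are illustrative, not real output):

# results ~ [
#     (True, {'url': 'https://img3.doubanio.com/.../some_photo.jpg',
#             'path': 'full/0a79c461a4062c5b466d7070f8ed2d346071ff13.jpg',
#             'checksum': '2b04df6e32be7056e3777fb54eaa5ee3'}),
#     (False, <Failure>),  # a request that failed to download
# ]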
run_spider.py
from scrapy import cmdline

cmd_str = 'scrapy crawl download_douban'
cmdline.execute(cmd_str.split(' '))
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban_imgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban_imgs'

SPIDER_MODULES = ['douban_imgs.spiders']
NEWSPIDER_MODULE = 'douban_imgs.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}

IMAGES_STORE = 'D:\\doubanimgs'
#IMAGES_STORE = '/tmp'

IMAGES_EXPIRES = 90

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban_imgs.settings

[deploy]
#url = http://localhost:6800/
project = douban_imgs
Summary
That is all for these complete Scrapy crawler examples; I hope you find them helpful. Interested readers can browse the other related topics on this site, and if anything here falls short, please point it out in a comment. Thanks for supporting the site!