使用scrapy ImagesPipeline爬取图片资源的示例代码
这是一个使用scrapy的ImagesPipeline爬取下载图片的示例,生成的图片保存在爬虫的full文件夹里。
scrapy startproject DoubanImgs
cd DoubanImgs
scrapy genspider download_douban douban.com
vim spiders/download_douban.py
# coding=utf-8
"""Spider that collects photo URLs from a Douban album for the images pipeline."""
from scrapy.spiders import Spider
import re
from scrapy import Request
from ..items import DoubanImgsItem


class download_douban(Spider):
    """Request the listing pages of one Douban photo album and emit image URLs.

    Spider arguments (``scrapy crawl download_douban -a name=value``):
        url: album id appended to the album base URL (default '1638835355').
        pages: number of listing pages to request (default 23).

    Each listing page that contains images yields one ``DoubanImgsItem`` whose
    ``image_urls`` field holds every ``<img src>`` found in the photo list.
    """

    name = 'download_douban'

    # Base URL every album page is built from.
    ALBUM_URL = 'http://www.douban.com/photos/album/'
    # Increment of the ``?start=`` offset between consecutive listing pages
    # (presumably the number of photos shown per page -- verify on the site).
    PAGE_STEP = 18
    # Number of listing pages requested when no ``pages`` argument is given.
    DEFAULT_PAGE_COUNT = 23

    # Browser-like headers sent with every listing-page request.
    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch,br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        """Build the list of album listing pages to crawl.

        Args:
            url: album id; kept on the instance as ``self.url``.
            *args, **kwargs: forwarded to ``Spider.__init__``. An optional
                ``pages`` keyword (int or numeric string, as passed by
                ``-a pages=N``) overrides the number of listing pages.
        """
        # Spider arguments arrive as strings, so normalise to int.
        pages = int(kwargs.pop('pages', self.DEFAULT_PAGE_COUNT))
        # Let the base class store the remaining spider arguments.
        super(download_douban, self).__init__(*args, **kwargs)

        self.allowed_domains = ['douban.com']
        self.url = url

        album_url = self.ALBUM_URL + url
        # The first page has no query string; later pages advance ``start``.
        self.start_urls = [
            album_url if page == 0
            else album_url + '/?start=' + str(page * self.PAGE_STEP)
            for page in range(pages)
        ]

    def start_requests(self):
        """Yield one request per listing page, using the browser-like headers."""
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        """Yield an item holding every image URL found in the photo list.

        Pages without any matching image produce no item.
        """
        # NOTE(review): the class value was restored to 'photolst clearfix'
        # (two CSS classes); confirm against the current page markup.
        list_imgs = response.xpath(
            '//div[@class="photolst clearfix"]//img/@src'
        ).extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
vim settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for DoubanImgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'DoubanImgs'

SPIDER_MODULES = ['DoubanImgs.spiders']
NEWSPIDER_MODULE = 'DoubanImgs.spiders'

# Enable the image download pipeline defined in pipelines.py.
ITEM_PIPELINES = {
    'DoubanImgs.pipelines.DoubanImgDownloadPipeline': 300,
}
# Directory the images pipeline writes to ('.' = the directory the crawl is
# started from); downloaded files end up in its "full" sub-folder.
IMAGES_STORE = '.'
# Number of days before an already-downloaded image is fetched again.
IMAGES_EXPIRES = 90

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'DoubanImgs (+http://www.yourdomain.com)'

# Obey robots.txt rules (turned off for this example)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'DoubanImgs.middlewares.DoubanimgsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'DoubanImgs.middlewares.DoubanimgsDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'DoubanImgs.pipelines.DoubanimgsPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
vim items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field


class DoubanImgsItem(scrapy.Item):
    """Item carrying the image URLs of one album page through the pipeline."""

    # URLs of the images to download; ImagesPipeline reads this field.
    image_urls = Field()
    # Download results; ImagesPipeline's default result field.
    images = Field()
    # Storage paths of the images that were downloaded successfully.
    image_paths = Field()
vim pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request


class DoubanImgsPipeline(object):
    """Pass-through pipeline generated by the project template."""

    def process_item(self, item, spider):
        """Return the item unchanged."""
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    """Download every URL in ``item['image_urls']`` with browser-like headers."""

    # Base headers for image requests; ``referer`` is replaced per request.
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip,deflate,sdch,br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per image URL of the item.

        Each request uses the image URL itself as its ``referer`` header.
        """
        for image_url in item['image_urls']:
            # Copy instead of mutating the class-level dict, which is shared
            # by every item and every pipeline instance.
            headers = dict(self.default_headers, referer=image_url)
            yield Request(image_url, headers=headers)

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        Args:
            results: iterable of ``(ok, info)`` pairs; for successful
                downloads ``info['path']`` is the stored file path.
            item: the item whose images were requested.
            info: pipeline bookkeeping object (unused here).

        Returns:
            The item with ``image_paths`` filled in.

        Raises:
            DropItem: if none of the item's images was downloaded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
到此这篇关于使用scrapy ImagesPipeline爬取图片资源的示例代码的文章就介绍到这了,更多相关scrapy ImagesPipeline爬取图片内容请搜索毛票票以前的文章或继续浏览下面的相关文章,希望大家以后多多支持毛票票!