使用scrapy ImagesPipeline爬取图片资源的示例代码
这是一个使用 scrapy 的 ImagesPipeline 爬取并下载图片的示例,生成的图片保存在爬虫的 full 文件夹里。
scrapy startproject DoubanImgs
cd DoubanImgs
scrapy genspider download_douban douban.com
vim spiders/download_douban.py
#coding=utf-8
from scrapy.spiders import Spider
import re
from scrapy import Request
from ..items import DoubanImgsItem
class download_douban(Spider):
    """Collect the image URLs shown on the pages of one Douban photo album.

    The album id is supplied through the ``url`` spider argument.  Every
    album page is requested with browser-like headers and each page that
    contains images yields one ``DoubanImgsItem`` whose ``image_urls`` field
    lists the ``src`` attributes found on that page.
    """

    name = 'download_douban'

    # Number of album pages requested, and the step of the ``start`` query
    # parameter between consecutive pages.
    PAGE_COUNT = 23
    PAGE_SIZE = 18

    # Browser-like headers sent with every album page request.
    # NOTE(review): 'sdch' and 'br' are advertised in Accept-Encoding; confirm
    # the installed Scrapy can decode them (brotli presumably needs an extra
    # package), otherwise trim this value to 'gzip,deflate'.
    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch,br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        """Build the list of album page URLs.

        Args:
            url: Album id appended to ``/photos/album/``.
            *args, **kwargs: Forwarded unchanged to ``Spider.__init__``.
        """
        # Call the base initializer so the standard Spider setup still runs.
        super(download_douban, self).__init__(*args, **kwargs)
        self.allowed_domains = ['douban.com']
        self.url = url

        album_url = 'http://www.douban.com/photos/album/' + url
        # The first page has no query string; later pages are addressed by
        # their ``start`` offset.
        self.start_urls = [album_url] + [
            album_url + '/?start=' + str(page * self.PAGE_SIZE)
            for page in range(1, self.PAGE_COUNT)
        ]

    def start_requests(self):
        """Yield one request per album page, using ``default_headers``."""
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        """Yield a ``DoubanImgsItem`` holding the image URLs of one page.

        Pages on which the XPath matches nothing yield no item.
        """
        list_imgs = response.xpath(
            '//div[@class="photolst clearfix"]//img/@src'
        ).extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
vim settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for the DoubanImgs project.
#
# Only the settings considered important or commonly used are listed here.
# More settings are described in the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "DoubanImgs"

SPIDER_MODULES = ["DoubanImgs.spiders"]
NEWSPIDER_MODULE = "DoubanImgs.spiders"

# Item pipelines: only the image download pipeline is enabled.
ITEM_PIPELINES = {
    "DoubanImgs.pipelines.DoubanImgDownloadPipeline": 300,
}

# Image storage location and expiration for the images pipeline
# (see the Scrapy media pipeline documentation for the exact semantics).
IMAGES_STORE = "."
IMAGES_EXPIRES = 90

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'DoubanImgs (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'DoubanImgs.middlewares.DoubanimgsSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'DoubanImgs.middlewares.DoubanimgsDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'DoubanImgs.pipelines.DoubanimgsPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
vim items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Field


class DoubanImgsItem(scrapy.Item):
    """Item carrying the image URLs of one album page and their results."""

    # Image URLs scraped by the spider; iterated by
    # DoubanImgDownloadPipeline.get_media_requests.
    image_urls = Field()
    # Presumably filled with per-image results by Scrapy's ImagesPipeline
    # (its default result field) -- confirm against the Scrapy version used.
    images = Field()
    # Paths of the successfully downloaded images, set by
    # DoubanImgDownloadPipeline.item_completed.
    image_paths = Field()
vim pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
from scrapy import log
class DoubanImgsPipeline(object):
    """Pass-through item pipeline that leaves every item untouched."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        return item
class DoubanImgDownloadPipeline(ImagesPipeline):
    """Images pipeline that requests every image with browser-like headers.

    Each URL in the item's ``image_urls`` field is requested with
    ``default_headers`` plus a per-request ``referer``.  The paths of the
    images that downloaded successfully are stored in ``image_paths``.
    """

    # Base headers for image requests.  'referer' is overridden per request
    # in get_media_requests; the value here is only a placeholder.
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip,deflate,sdch,br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        The ``referer`` header of each request is the image URL itself.
        """
        for image_url in item['image_urls']:
            # Copy the headers for each request so the class-level dict,
            # which is shared by every instance, is never mutated.
            headers = dict(self.default_headers, referer=image_url)
            yield Request(image_url, headers=headers)

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        Args:
            results: Iterable of ``(ok, value)`` pairs; ``value['path']`` is
                read for every pair whose ``ok`` is true.
            item: The item whose images were requested.
            info: Unused.

        Returns:
            The item with ``image_paths`` set.

        Raises:
            DropItem: If no image was downloaded successfully.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
到此这篇关于使用 scrapy ImagesPipeline 爬取图片资源的示例代码的文章就介绍到这了,更多相关 scrapy ImagesPipeline 爬取图片内容请搜索毛票票以前的文章或继续浏览下面的相关文章,希望大家以后多多支持毛票票!