使用scrapy ImagesPipeline爬取图片资源的示例代码

2023-07-14 13:56:03 385

这是一个使用scrapy的ImagesPipeline爬取下载图片的示例，生成的图片保存在爬虫的full文件夹里。

scrapy startproject DoubanImgs

cd DoubanImgs

scrapy genspider download_douban douban.com

vim spiders/download_douban.py

#coding=utf-8
# Imports for the album spider. The keywords had been fused together by a
# whitespace-stripping paste, which made every line a syntax error.
import re

from scrapy import Request
from scrapy.spiders import Spider

from ..items import DoubanImgsItem


class download_douban(Spider):
    """Spider that collects image URLs from the pages of a Douban photo album.

    The album id is passed as ``url`` (e.g. ``-a url=1638835355``). One
    request is issued per album page; every page that contains images
    yields a single ``DoubanImgsItem`` whose ``image_urls`` field lists the
    ``src`` attributes found on that page.
    """

    name = 'download_douban'

    # Browser-like headers sent with every album page request.
    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch,br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    # Base URL of an album; the album id is appended to it.
    ALBUM_URL = 'http://www.douban.com/photos/album/'
    # Step between the ``start`` offsets of two consecutive album pages.
    PAGE_SIZE = 18
    # Number of album pages requested when no ``pages`` argument is given.
    DEFAULT_PAGES = 23

    def __init__(self, url='1638835355', *args, **kwargs):
        """Build the list of album page URLs to crawl.

        Args:
            url: Album id appended to ``ALBUM_URL``.
            *args: Forwarded to ``Spider.__init__``.
            **kwargs: Forwarded to ``Spider.__init__``. An optional
                ``pages`` entry (int or numeric string) overrides the number
                of album pages to request and is not forwarded.
        """
        # Spider arguments given with ``-a`` arrive as strings, hence int().
        pages = int(kwargs.pop('pages', self.DEFAULT_PAGES))
        # Call the base initializer so the standard Spider set-up runs.
        super(download_douban, self).__init__(*args, **kwargs)

        self.allowed_domains = ['douban.com']
        self.url = url
        self.start_urls = []
        # ``range`` instead of ``xrange`` so the loop runs on Python 2 and 3.
        for page in range(pages):
            page_url = self.ALBUM_URL + url
            if page:
                # Every page after the first is addressed by its offset.
                page_url += '/?start=' + str(page * self.PAGE_SIZE)
            self.start_urls.append(page_url)

    def start_requests(self):
        """Yield one request per album page, using the browser-like headers."""
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        """Yield an item with the image URLs of one album page, if any.

        The class attribute value must read ``photolst clearfix`` (two
        classes separated by a space); without the space the XPath never
        matches and no item is produced.
        """
        list_imgs = response.xpath(
            '//div[@class="photolst clearfix"]//img/@src'
        ).extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item

vim settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for DoubanImgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'DoubanImgs'

SPIDER_MODULES = ['DoubanImgs.spiders']
NEWSPIDER_MODULE = 'DoubanImgs.spiders'

# Route scraped items through the image-download pipeline.
ITEM_PIPELINES = {
    'DoubanImgs.pipelines.DoubanImgDownloadPipeline': 300,
}
# Root directory for downloaded images ('.' is the current working directory).
IMAGES_STORE = '.'
# Image expiration, in days according to Scrapy's media-pipeline docs.
IMAGES_EXPIRES = 90

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'DoubanImgs (+http://www.yourdomain.com)'

# Obey robots.txt rules (switched off for this project)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'DoubanImgs.middlewares.DoubanimgsSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'DoubanImgs.middlewares.DoubanimgsDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'DoubanImgs.pipelines.DoubanimgsPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

vim items.py

#-*-coding:utf-8-*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
# Imports for the item definitions. The keywords had been fused together by
# a whitespace-stripping paste, which made both lines invalid Python.
import scrapy
from scrapy import Field


class DoubanImgsItem(scrapy.Item):
    """Item describing the images found on one album page."""

    # Source URLs of the images; filled in by the spider's parse() and read
    # by the download pipeline's get_media_requests().
    image_urls = Field()
    # Not written by any code visible in this project; presumably kept for
    # ImagesPipeline's default result field -- TODO confirm.
    images = Field()
    # Storage paths of the successfully downloaded images; set by the
    # download pipeline's item_completed().
    image_paths = Field()

vim pipelines.py

#-*-coding:utf-8-*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Imports for the pipelines. The keywords had been fused together by a
# whitespace-stripping paste, which made every line a syntax error.
from scrapy import Request
# NOTE(review): `scrapy.log` is not used by the pipelines below and was
# deprecated in favour of the stdlib `logging` module -- confirm whether the
# targeted Scrapy version still ships it before relying on this import.
from scrapy import log
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DoubanImgsPipeline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` untouched; ``spider`` is not used."""
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    """Images pipeline that downloads every URL in ``item['image_urls']``.

    Each image is requested with browser-like headers, and the storage
    paths of the successful downloads are recorded in
    ``item['image_paths']``.
    """

    # Template headers for image requests. The 'referer' entry is replaced
    # per request in get_media_requests(); this dict itself is never mutated.
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip,deflate,sdch,br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per image URL of ``item``.

        Every request gets its own copy of the headers with 'referer' set
        to the image URL, so the class-level template shared by all items
        and pipeline instances is left untouched.
        """
        for image_url in item.get('image_urls') or []:
            headers = dict(self.default_headers, referer=image_url)
            yield Request(image_url, headers=headers)

    def item_completed(self, results, item, info):
        """Store the paths of the downloaded images on ``item``.

        Args:
            results: Iterable of ``(ok, value)`` pairs; for successful
                downloads ``value['path']`` is the stored file path.
            item: The item whose images were requested.
            info: Unused here.

        Returns:
            ``item`` with ``image_paths`` filled in.

        Raises:
            DropItem: If no image of the item was downloaded successfully.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

到此这篇关于使用 scrapy ImagesPipeline 爬取图片资源的示例代码的文章就介绍到这了，更多相关 scrapy ImagesPipeline 爬取图片内容请搜索毛票票以前的文章或继续浏览下面的相关文章，希望大家以后多多支持毛票票！

使用scrapy ImagesPipeline爬取图片资源的示例代码

热门推荐

随机推荐