Python自定义scrapy中间模块避免重复采集的方法
本文实例讲述了Python自定义scrapy中间模块避免重复采集的方法。分享给大家供大家参考。具体如下:
from scrapy import log
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint

from myproject.items import MyItem
class IgnoreVisitedItems(object):
    """Spider middleware that ignores re-visits to item pages already seen.

    Requests to be filtered must set ``meta['filter_visited']`` and may
    optionally carry a ``meta['visited_id']`` used to identify them; when
    absent, the request fingerprint is used instead.  Using the item id is
    more robust if you already have it beforehand.
    """

    # meta flag that opts a request into dedup filtering
    FILTER_VISITED = 'filter_visited'
    # optional meta key carrying a caller-supplied visit id
    VISITED_ID = 'visited_id'
    # key under which seen ids are stored on ``spider.context``
    CONTEXT_KEY = 'visited_ids'

    def process_spider_output(self, response, result, spider):
        """Filter *result*, dropping requests whose id was already visited.

        Items passing through are tagged with ``visit_id``/``visit_status``;
        already-visited requests are replaced by a stub ``MyItem`` with
        ``visit_status='old'`` so the pipeline can record the skip.
        """
        # Per-spider persistent state; spiders without a ``context``
        # attribute get a throwaway dict (nothing is remembered then).
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg("Ignoring already visited: %s" % x.url,
                                level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                # An item means this response's request completed a visit:
                # record its id and tag the item as newly seen.
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                # Emit a marker item instead of the duplicate request.
                ret.append(MyItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret

    def _visited_id(self, request):
        """Return the caller-supplied visit id, or the request fingerprint."""
        return request.meta.get(self.VISITED_ID) or request_fingerprint(request)
希望本文所述对大家的Python程序设计有所帮助。