Python自定义scrapy中间件模块避免重复采集的方法
本文实例讲述了Python自定义scrapy中间件模块避免重复采集的方法。分享给大家供大家参考。具体如下:
from scrapy import log
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.request import request_fingerprint
from myproject.items import MyItem


class IgnoreVisitedItems(object):
    """Middleware to ignore re-visiting item pages if they
    were already visited before.

    The requests to be filtered have a meta['filter_visited']
    flag enabled, and optionally define an id to use for
    identifying them, which defaults to the request fingerprint,
    although you'd want to use the item id, if you already have
    it beforehand, to make it more robust.
    """

    # meta key that marks a request as subject to de-duplication
    FILTER_VISITED = 'filter_visited'
    # optional meta key carrying a caller-supplied visit identifier
    VISITED_ID = 'visited_id'
    # key under spider.context where the set of seen ids is stored
    CONTEXT_KEY = 'visited_ids'

    def process_spider_output(self, response, result, spider):
        """Filter the spider's output, dropping requests whose visit id
        was already recorded and tagging items with their visit status.

        :param response: the response being processed
        :param result: iterable of Request/Item objects yielded by the spider
        :param spider: the running spider (its ``context`` dict persists ids)
        :returns: list with already-visited requests replaced by 'old' items
        """
        # Lazily create the per-spider registry of visited ids.
        context = getattr(spider, 'context', {})
        visited_ids = context.setdefault(self.CONTEXT_KEY, {})
        ret = []
        for x in result:
            visited = False
            if isinstance(x, Request):
                # Only requests explicitly flagged for filtering are checked.
                if self.FILTER_VISITED in x.meta:
                    visit_id = self._visited_id(x)
                    if visit_id in visited_ids:
                        log.msg("Ignoring already visited: %s" % x.url,
                                level=log.INFO, spider=spider)
                        visited = True
            elif isinstance(x, BaseItem):
                # An item made it through: record its originating request's
                # id so future requests for the same page are skipped.
                visit_id = self._visited_id(response.request)
                if visit_id:
                    visited_ids[visit_id] = True
                    x['visit_id'] = visit_id
                    x['visit_status'] = 'new'
            if visited:
                # Emit a placeholder item instead of the duplicate request.
                ret.append(MyItem(visit_id=visit_id, visit_status='old'))
            else:
                ret.append(x)
        return ret

    def _visited_id(self, request):
        """Return the id identifying this visit: the caller-supplied
        meta['visited_id'] if present, else the request fingerprint."""
        return request.meta.get(self.VISITED_ID) or request_fingerprint(request)
希望本文所述对大家的Python程序设计有所帮助。