Scraping Juhuasuan (聚划算) deal pages with Python and saving the product information locally as XML
This article shares working code that scrapes Juhuasuan product pages with Python, extracts the product information, and saves it locally. The full listing follows:
#!/usr/bin/python
# -*- coding: gbk -*-
# Spider.py

import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## work around encoding errors when printing Chinese characters to the console
reload(sys)
sys.setdefaultencoding("utf8")

#####################################################
## debug switch: when on, HTTP request headers and debug logs are printed
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## whether to print the fetched page source
showSrcCode = False
## accepted compression scheme
ZIP_TYPE = "gzip"

fileName = "auctions"
location = "d:/spiderData/"

## request headers
headerConfig = {"User-Agent": "taobao-yanyuan.qzs", "Accept-encoding": ZIP_TYPE}
#####################################################


############# class SpiderConfig #####################
class SpiderConfig:
    """
    Configuration for a spider: name and url
    """
    def __init__(self, name, url):
        self.name = name
        self.url = url
#####################################################


############## class SpiderAuctionDomain ##############
class SpiderAuctionDomain:
    """
    Stores the information of one auction spidered by Python
    """
    title = ""
    link = ""
    img = ""
    price = ""

    def __init__(self):
        pass
#####################################################


######## class SpiderDefaultErrorHandler ##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        """
        Default error handler for the spider
        """
        result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
        result.status = code
        result.url = req.get_full_url()
        print "<", result.url, "Exception code:", result.status, ">"
        return result
#####################################################


############# class SpiderHandler #####################
class SpiderHandler:
    """
    Spider handler
    """
    def spider(self, spiderConfig):
        try:
            request = urllib2.Request(spiderConfig.url)

            ## configure request headers
            for key, val in headerConfig.items():
                request.add_header(key, val)

            ## build opener
            opener = urllib2.build_opener(SpiderDefaultErrorHandler())

            ## open request
            openRequest = opener.open(request)

            ## read data
            spiderData = openRequest.read()

            ## close
            opener.close()

            if 0 == len(spiderData):
                return

            if ZIP_TYPE == openRequest.headers.get("Content-Encoding"):
                spiderData = SpiderHandler.gzipData(self, spiderData)

            if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
                print spiderData

            # parse html
            SpiderHandler.parse(self, spiderData)
        except Exception, x:
            print "spider process Exception:", x

    def parse(self, spiderData):
        """
        Parse the html content
        """
        if httplib.HTTPConnection.debuglevel == DEBUG:
            charsetAnalyze = chardet.detect(spiderData)
            print "analyze spider data encode:", charsetAnalyze["encoding"]

        print "parsing", fileName

        soup = BeautifulSoup(spiderData)
        encode = soup.originalEncoding
        encoding = lambda x: x.encode(encode)

        if httplib.HTTPConnection.debuglevel == DEBUG:
            print "detected encoding:", encode

        title = soup.head.title.string
        print encoding(title)

        # CSS class as in the original post (spacing restored)
        spiderContents = soup.findAll(name="div", attrs={"class": "main-box avil"})
        auctions = ["%s" % s for s in spiderContents]
        if not auctions:
            return

        # NOTE: the four regular expressions below were mangled when the original
        # post was published (the HTML tags inside them were stripped); they are
        # plausible reconstructions of the lost patterns.
        auctionList = []
        for auc in auctions:
            auctionDomain = SpiderAuctionDomain()
            # parse auction link
            links = re.search(re.compile(r'<a[^>]+href=["\'][^"\']*item_id=([^"\'&]*)', re.IGNORECASE), auc)
            if links is not None:
                auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))

            # parse auction title
            titles = re.search(re.compile(r'<a[^>]*>([^<]+)</a>', re.IGNORECASE), auc)
            if titles is not None:
                auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))

            # parse auction price
            price = re.search(re.compile(r'<strong>\s*([^<]*)</strong>', re.IGNORECASE), auc)
            if price is not None:
                auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])

            # parse image url
            imgs = re.search(re.compile(r'<img\s+src=[\'"]([^\'"]*)[\'"]', re.IGNORECASE), auc)
            if imgs is not None:
                auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])

            auctionList.append(auctionDomain)

        print "auctions parsed:"
        for a in auctionList:
            print "--->", a.title

        # sort auction list
        auctionList = SpiderHandler.sortAuctionList(self, auctionList)

        # save to file
        SpiderHandler.save(self, auctionList)

        print "parse complete"

    def sortAuctionList(self, auctionList):
        """
        Bubble sort by price
        """
        length = len(auctionList)
        if length < 2:
            return auctionList
        for i in range(length - 1):
            for j in range(length - i - 1):
                if float(auctionList[j].price) > float(auctionList[j + 1].price):
                    auctionList[j], auctionList[j + 1] = auctionList[j + 1], auctionList[j]
        return auctionList

    def save(self, auctionList):
        if auctionList is not None:
            doc = Document()
            auctions = doc.createElement("auctions")
            doc.appendChild(auctions)
            for auc in auctionList:
                auction = doc.createElement("auction")
                auctions.appendChild(auction)
                SpiderHandler.generateXML(self, doc, auction, "title", auc.title)
                SpiderHandler.generateXML(self, doc, auction, "price", auc.price)
                SpiderHandler.generateXML(self, doc, auction, "img", auc.img)
                SpiderHandler.generateXML(self, doc, auction, "link", auc.link)
            if not os.path.exists(location):
                os.mkdir(location)
            out = open(location + fileName + ".xml", 'w')
            out.write(doc.toprettyxml())
            out.close()
            if httplib.HTTPConnection.debuglevel == DEBUG:
                print doc.toprettyxml()

    def generateXML(self, doc, parent, name, txt):
        c = doc.createElement(name)
        parent.appendChild(c)
        c.appendChild(doc.createTextNode(txt))

    def gzipData(self, spiderData):
        """
        Decompress gzip-encoded response data
        """
        if 0 == len(spiderData):
            return spiderData
        spiderDataStream = StringIO.StringIO(spiderData)
        spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
        return spiderData
#####################################################

if __name__ == "__main__":
    nowtime = lambda: datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M:%S")

    needSpiderUrl = {"suzhou": "http://ju.taobao.com/suzhou",
                     "hangzhou": "http://ju.taobao.com/hangzhou",
                     "shanghai": "http://ju.taobao.com/shanghai",
                     "beijing": "http://ju.taobao.com/beijing",
                     "chengdu": "http://ju.taobao.com/chengdu"}

    configList = []
    for k, v in needSpiderUrl.items():
        spiderConfig = SpiderConfig(k, v)
        configList.append(spiderConfig)

    spiderHandler = SpiderHandler()

    print "spider start time:", nowtime()

    for spiderConfig in configList:
        fileName = spiderConfig.name
        spiderHandler.spider(spiderConfig)

    print "spider finish time:", nowtime()
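After a run, each city's results land in their own file (suzhou.xml, hangzhou.xml, and so on, under d:/spiderData/). As a quick sanity check, a saved file can be read back with the same xml.dom.minidom module. Below is a minimal sketch, assuming the paths and element names used in the listing above:

# read_auctions.py -- read back one XML file written by SpiderHandler.save;
# the path "d:/spiderData/suzhou.xml" is assumed from the config above
from xml.dom.minidom import parse

doc = parse("d:/spiderData/suzhou.xml")
for auction in doc.getElementsByTagName("auction"):
    fields = {}
    for tag in ("title", "price", "img", "link"):
        node = auction.getElementsByTagName(tag)[0]
        # toprettyxml() pads text nodes with newlines and indentation, so
        # strip it; an empty element has no child text node at all
        fields[tag] = node.firstChild.data.strip() if node.firstChild else ""
    print fields["price"], fields["title"]

Because sortAuctionList bubble-sorts the list by price before save is called, the auction elements come back cheapest first.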
For more on this topic, see the series 《python爬取功能汇总》 (a roundup of Python scraping techniques).
That concludes this article. We hope it helps with your studies, and we hope you will continue to support 毛票票.