Scraping 聚划算 (Juhuasuan) deal pages with Python and saving product information locally as XML
This article shares the full code for a Python spider that fetches Juhuasuan deal pages, extracts the product information, and saves it locally, for your reference. The details follow.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Spider.py

import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## Work around encoding errors when printing Chinese characters to the console
reload(sys)
sys.setdefaultencoding("utf8")

#####################################################
## Debug switch; when on, HTTP request headers and debug logs are printed
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## Whether to print the source of the crawled page
showSrcCode = False
## Compression scheme accepted in responses
ZIP_TYPE = "gzip"

fileName = "auctions"
location = "d:/spiderData/"

## Request headers
headerConfig = {"User-Agent": "taobao-yanyuan.qzs", "Accept-encoding": ZIP_TYPE}
#####################################################
#############  class SpiderConfig  #####################
class SpiderConfig:
    """
    configuration for spider name and url
    """
    def __init__(self, name, url):
        self.name = name
        self.url = url
#####################################################
##############  class SpiderAuctionDomain  ##############
class SpiderAuctionDomain:
    """
    Store information of auctions spidered by python
    """
    title = ""
    link = ""
    img = ""
    price = ""

    def __init__(self):
        pass
#####################################################
########  class SpiderDefaultErrorHandler  ##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        """
        default error process handler for spider
        """
        result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
        result.status = code
        result.url = req.get_full_url()
        print "<", result.url, "Exception code:", result.status, ">"
        return result
#####################################################
#############  class SpiderHandler  #####################
class SpiderHandler:
    """
    spider handler
    """
    def spider(self, spiderConfig):
        try:
            request = urllib2.Request(spiderConfig.url)
            ## configure request header
            for key, val in headerConfig.items():
                request.add_header(key, val)
            ## build opener
            opener = urllib2.build_opener(SpiderDefaultErrorHandler())
            ## open request
            openRequest = opener.open(request)
            ## read data
            spiderData = openRequest.read()
            ## close
            opener.close()
            if 0 == len(spiderData):
                return
            ## decompress the body if the server returned it gzipped
            if ZIP_TYPE == openRequest.headers.get("Content-Encoding"):
                spiderData = SpiderHandler.gzipData(self, spiderData)
            if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
                print spiderData
            # parse html
            SpiderHandler.parse(self, spiderData)
        except Exception, x:
            print "spider process Exception:", x
    def parse(self, spiderData):
        """
        parse html content
        """
        if httplib.HTTPConnection.debuglevel == DEBUG:
            charsetAnalyze = chardet.detect(spiderData)
            print "analyze spider data encode:", charsetAnalyze["encoding"]

        print "parsing", fileName
        soup = BeautifulSoup(spiderData)
        encode = soup.originalEncoding
        encoding = lambda x: x.encode(encode)
        if httplib.HTTPConnection.debuglevel == DEBUG:
            print "detected encoding:", encode

        title = soup.head.title.string
        print encoding(title)

        spiderContents = soup.findAll(name="div", attrs={"class": "main-box avil"})
        auctions = ["%s" % s for s in spiderContents]
        if not auctions:
            return

        auctionList = []
        for auc in auctions:
            auctionDomain = SpiderAuctionDomain()
            # NOTE: the exact regular expressions were garbled when the article
            # was published; the patterns below are plausible reconstructions
            # parse auction link
            links = re.search(re.compile(r'item_id=([^>\"\']*)[\"|\']', re.IGNORECASE), auc)
            if links is not None:
                auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))
            # parse auction title
            titles = re.search(re.compile(r"title=[\"|\']([^>\"\']*)[\"|\']", re.IGNORECASE), auc)
            if titles is not None:
                auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))
            # parse auction price
            price = re.search(re.compile(r">([^<]*)</strong>", re.IGNORECASE), auc)
            if price is not None:
                auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])
            # parse image url
            imgs = re.search(re.compile(r"src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
            if imgs is not None:
                auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])
            auctionList.append(auctionDomain)

        print "parsed auction info:"
        for a in auctionList:
            print "--->", a.title
        # sort auction list by price
        auctionList = SpiderHandler.sortAuctionList(self, auctionList)
        # save in file
        SpiderHandler.save(self, auctionList)
        print "parse finished"
    def sortAuctionList(self, auctionList):
        """
        bubble sort, ordered by price ascending
        """
        length = len(auctionList)
        if length < 2:
            return auctionList
        for i in range(length - 1):
            for j in range(length - i - 1):
                if float(auctionList[j].price) > float(auctionList[j + 1].price):
                    auctionList[j], auctionList[j + 1] = auctionList[j + 1], auctionList[j]
        return auctionList
    def save(self, auctionList):
        if auctionList is not None:
            doc = Document()
            auctions = doc.createElement("auctions")
            doc.appendChild(auctions)
            for auc in auctionList:
                auction = doc.createElement("auction")
                auctions.appendChild(auction)
                SpiderHandler.generateXML(self, doc, auction, "title", auc.title)
                SpiderHandler.generateXML(self, doc, auction, "price", auc.price)
                SpiderHandler.generateXML(self, doc, auction, "img", auc.img)
                SpiderHandler.generateXML(self, doc, auction, "link", auc.link)
            if not os.path.exists(location):
                os.mkdir(location)
            xmlFile = open(location + fileName + ".xml", 'w')
            xmlFile.write(doc.toprettyxml())
            xmlFile.close()
            if httplib.HTTPConnection.debuglevel == DEBUG:
                print doc.toprettyxml()
    def generateXML(self, doc, parent, name, txt):
        """
        append a child element named `name` holding text `txt` to parent
        """
        c = doc.createElement(name)
        parent.appendChild(c)
        c.appendChild(doc.createTextNode(txt))
    def gzipData(self, spiderData):
        """
        decompress gzip-encoded data
        """
        if 0 == len(spiderData):
            return spiderData
        spiderDataStream = StringIO.StringIO(spiderData)
        spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
        return spiderData
#####################################################
if __name__ == "__main__":
    nowtime = lambda: datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M:%S")

    needSpiderUrl = {"suzhou": "http://ju.taobao.com/suzhou",
                     "hangzhou": "http://ju.taobao.com/hangzhou",
                     "shanghai": "http://ju.taobao.com/shanghai",
                     "beijing": "http://ju.taobao.com/beijing",
                     "chengdu": "http://ju.taobao.com/chengdu"}

    configList = []
    for k, v in needSpiderUrl.items():
        spiderConfig = SpiderConfig(k, v)
        configList.append(spiderConfig)

    spiderHandler = SpiderHandler()
    print "spider start time:", nowtime()
    for spiderConfig in configList:
        fileName = spiderConfig.name
        spiderHandler.spider(spiderConfig)
    print "spider finish time:", nowtime()
For more on this topic, see the series 《python爬取功能汇总》.
That's all for this article. Hopefully it helps with your study, and please continue to support 毛票票.