Python 制作最美应用的爬虫
安卓最美应用页面爬虫,爬虫很简单,涉及的东西倒挺多的
文件操作
正则表达式
字符串替换等等
import re

import requests
# Site root; hrefs scraped from the listing page are prefixed with it before
# they are fetched.
url = "http://zuimeia.com"

# Fetch the "hot" community listing (platform=2) once, at import time.
# The timeout keeps a stalled connection from hanging the script forever.
r = requests.get(url + '/community/app/hot/?platform=2', timeout=10)

# Captures the href of every app-cover link on the listing page.
# NOTE(review): assumes the site separates the tag name and its attributes
# with single spaces — confirm against the live markup.
pattern = re.compile(
    r'<a class="community-app-cover-wrapper" href="(.*?)" target="_blank">'
)

# Search the decoded body: a str pattern cannot be applied to the raw bytes
# in r.content on Python 3.
urlList = pattern.findall(r.text)
def requestsUrl(url, timeout=10):
    """Fetch one app detail page and scrape its fields with regexes.

    Args:
        url: Absolute URL of the app detail page.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A 4-tuple ``(title, category, strdescribe, downloadUrl)``.
        ``title``, ``category`` and ``downloadUrl`` are the lists produced by
        ``re.findall`` (empty when nothing matched). ``strdescribe`` is the
        first description match passed through ``srtReplace``, or ``''``
        when the page has no description block.
    """
    # Work on the decoded text: str patterns cannot search the raw bytes of
    # Response.content on Python 3.
    page = requests.get(url, timeout=timeout).text

    # NOTE(review): the patterns assume single spaces between a tag name and
    # its attributes, and the class list "download-button direct hidden" —
    # confirm against the live markup.
    title = re.findall(r'"app-title"><h1>(.*?)</h1>', page)
    category = re.findall(
        r'<a class="app-tag" href="/community/app/category/title/.*?/?platform=2">(.*?)</a>',
        page,
    )
    describe = re.findall(
        r'<div id="article_content">(.*?)<div class="community-image-wrapper">',
        page,
    )
    downloadUrl = re.findall(
        r'<a class="download-button direct hidden" href="(.*?)"',
        page,
    )

    # A page without a description block must not abort the whole crawl.
    strdescribe = srtReplace(describe[0]) if describe else ''
    return title, category, strdescribe, downloadUrl
def srtReplace(string):
    """Replace a fixed set of HTML tags with newlines.

    Each opening and closing paragraph, heading (h1-h7), ``<br>``/``<br/>``,
    ``<strong>`` and ``<b>`` tag is replaced by a newline character. After
    all tags are handled, every pair of consecutive newlines is removed.

    Args:
        string: HTML fragment to clean up.

    Returns:
        The fragment with the listed tags converted as described; any other
        markup is left untouched.
    """
    tags = (
        '<p>', '<br>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<h7>',
        '<strong>', '</p>', '<br/>', '</h1>', '</h2>', '</h3>', '</h4>',
        '</h5>', '</h6>', '</h7>', '</strong>', '<b>', '</b>',
    )
    for tag in tags:
        string = string.replace(tag, '\n')

    # Pairs of newlines are dropped entirely (not collapsed to one), so text
    # on either side of an adjacent close/open tag pair ends up joined.
    return string.replace('\n\n', '')
def categornFinal(category):
    """Join category names into one string, each followed by ``'-->'``.

    Args:
        category: Iterable of category names; each item is converted with
            ``str()`` before joining.

    Returns:
        The concatenation ``name1-->name2-->...`` including a trailing
        ``'-->'`` after the last item, or ``''`` for an empty iterable.
    """
    return ''.join(str(name) + '-->' for name in category)
def urlReplace(url):
    """Decode the ``&amp;`` HTML entity in a URL scraped from an href.

    Args:
        url: URL text as it appears inside the HTML attribute.

    Returns:
        The URL with every ``&amp;`` replaced by a plain ``&``.
    """
    return url.replace('&amp;', '&')
# Crawl every app linked from the listing page and append one record per app.
# The output file is opened once for the whole run, explicitly as UTF-8 so
# non-ASCII text can be written whatever the platform's default encoding is.
with open('c:/wqa.txt', 'a', encoding='utf-8') as fd:
    for eachUrl in urlList:
        # Listing hrefs are prefixed with the site root before fetching.
        titles, categories, strdescribe, downloadUrls = requestsUrl(url + eachUrl)

        # titles / downloadUrls are findall lists: fall back to '' when the
        # page had no match instead of raising IndexError.
        title = titles[0] if titles else ''
        category = categornFinal(categories)
        downloadUrl = urlReplace(downloadUrls[0]) if downloadUrls else ''

        fd.write(
            'title:' + title + '\n'
            + 'category:' + category + '\n'
            + 'strdescribe:' + strdescribe + '\n'
            + 'downloadUrl:' + downloadUrl
            + '\n\n\n-----------------------------------------------------------------------------------------------------------------------------\n\n\n'
        )