Python 实现提取百度搜索结果的方法
本文实例讲述了 Python 实现提取百度搜索结果的方法。分享给大家供大家参考。具体实现方法如下：
#coding=utf8
import random
import re
import string
import urllib
import urllib2
# Pool of User-Agent strings to rotate through, so that Baidu does not
# restrict the client by IP.
# NOTE(review): none of these strings contains a space, unlike typical
# browser User-Agent values -- possibly stripped in transcription; confirm.
user_agents = [
    'Mozilla/5.0(WindowsNT6.1;WOW64;rv:23.0)Gecko/20130406Firefox/23.0',
    'Mozilla/5.0(WindowsNT6.1;WOW64;rv:18.0)Gecko/20100101Firefox/18.0',
    ('Mozilla/5.0(Windows;U;WindowsNT6.1;en-US)AppleWebKit/533+'
     '(KHTML,likeGecko)ElementBrowser5.0'),
    'IBMWebExplorer/v0.94',
    'Galaxy/1.0[en](MacOSX10.5.6;U;en)',
    'Mozilla/5.0(compatible;MSIE10.0;WindowsNT6.1;WOW64;Trident/6.0)',
    'Opera/9.80(WindowsNT6.0)Presto/2.12.388Version/12.14',
    ('Mozilla/5.0(iPad;CPUOS6_0likeMacOSX)AppleWebKit/536.26(KHTML,likeGecko)'
     'Version/6.0Mobile/10A5355dSafari/8536.25'),
    ('Mozilla/5.0(WindowsNT6.1)AppleWebKit/537.36(KHTML,likeGecko)'
     'Chrome/28.0.1468.0Safari/537.36'),
    'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.0;Trident/5.0;TheWorld)',
]
def baidu_search(keyword, pn):
    """Fetch one Baidu search results page and return its raw body.

    Args:
        keyword: Search term; URL-encoded and sent as the ``wd`` parameter.
        pn: Value substituted into the ``pn`` query parameter.

    Returns:
        Whatever ``read()`` on the HTTP response yields (the undecoded page).

    Raises:
        Any error raised by ``urllib2.urlopen`` propagates to the caller.
    """
    query = urllib.urlencode({'wd': keyword})
    url = "http://www.baidu.com/s?" + query + "&pn={0}&cl=3&rn=100".format(pn)
    res = urllib2.urlopen(url)
    try:
        return res.read()
    finally:
        # Release the connection even if read() fails.
        res.close()
def getList(regex, text):
    """Return every non-overlapping match of *regex* in *text* as a list.

    Follows ``re.findall`` semantics: items are strings when the pattern has
    at most one group and tuples of strings when it has several. An empty
    list is returned when nothing matches.
    """
    # re.findall already returns a fresh list, so no manual copy is needed.
    return re.findall(regex, text)
def getMatch(regex, text):
    """Return the first match of *regex* in *text*, or "" when there is none.

    Uses ``re.findall``, so the result is a string for patterns with at most
    one group and a tuple of strings for patterns with several groups.
    """
    matches = re.findall(regex, text)
    return matches[0] if matches else ""
def clearTag(text):
    """Return *text* with every ``<...>`` tag removed.

    A tag is a ``<`` followed by one or more non-``>`` characters and a
    closing ``>``; a lone ``<`` with no closing ``>`` is left untouched.
    """
    return re.sub(u'<[^>]+>', "", text)
def geturl(keyword):
    """Print the resolved URL of each Baidu search result for *keyword*.

    Fetches ten result pages with ``baidu_search``, pulls each result's link
    out of the HTML with regular expressions, opens that link with a randomly
    chosen User-Agent, and prints the URL reported by the response (i.e. after
    any redirects). Results that cannot be parsed or fetched are skipped.

    Args:
        keyword: Search term passed through to ``baidu_search``.

    Returns:
        None. Output is written to stdout, one URL per line.
    """
    # Loop-invariant patterns, hoisted out of the loops.
    result_regex = u'<table.*?class="result".*?>.*?<\\/a>'
    link_regex = u'<h3.*?class="t".*?><a.*?href="(.*?)".*?>(.*?)<\\/a>'

    for page in range(10):
        pn = page * 100 + 1
        html = baidu_search(keyword, pn)
        content = unicode(html, 'utf-8', 'ignore')
        for item in getList(result_regex, content):
            link = getMatch(link_regex, item)
            if not link:
                # getMatch returns "" when the result has no <h3 class="t">
                # link; indexing it would raise IndexError.
                continue
            # link is (href, anchor inner HTML); only the href is used here.
            url = link[0]
            try:
                request = urllib2.Request(url)
                # random.choice always picks a valid entry; the previous
                # randint(0, 11) could index past the end of the list.
                request.add_header('User-agent', random.choice(user_agents))
                request.add_header('connection', 'keep-alive')
                response = urllib2.urlopen(request)
            except Exception:
                # Best effort: skip results whose link cannot be opened, but
                # let KeyboardInterrupt/SystemExit through.
                continue
            try:
                print(response.geturl())
            finally:
                response.close()
# Script entry point: print the resolved result URLs for the query 'python'.
if __name__ == '__main__':
    geturl('python')
希望本文所述对大家的Python程序设计有所帮助。