python爬取51job中hr的邮箱
本文实例为大家分享了使用 Python 爬取 51job 中 HR 邮箱的具体代码,供大家参考,具体内容如下:
#encoding=utf8
import cookielib
import re
import sys
import urllib2
from _ast import TryExcept
from warnings import catch_warnings

import lxml.html
# Output file shared by readpage() and the __main__ block below. It is opened
# in append mode ('a+'), so content already in the file is kept and new lines
# are added at the end.
f=open('/root/Desktop/51-01.txt','a+')
def read(city):
    """Fetch a 51job city page and scrape every hot-job page it links to.

    Downloads ``http://www.51job.com/<city>``, extracts every URL that
    matches the hot-job pattern, prints each URL and hands it to
    ``readpage``.  A failure on one job page is reported on stderr and the
    remaining pages are still processed.

    Args:
        city: City path segment appended to the 51job base URL.

    Raises:
        Any exception raised while fetching the city page itself (for
        example ``urllib2.URLError``) propagates to the caller.
    """
    url = 'http://www.51job.com/' + city

    # Cookie-aware opener with a browser-like User-Agent, installed globally
    # so that urllib2.urlopen() uses it.
    cj = cookielib.MozillaCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 '
        'Firefox/38.0 Iceweasel/38.3.0',
    )]
    urllib2.install_opener(opener)

    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()

    # Dots in the host name are escaped so they only match a literal '.'.
    job_url_pattern = r'http://jobs\.51job\.com/hot/.*?html'
    for job_url in re.findall(job_url_pattern, page):
        print(job_url)
        try:
            readpage(job_url)
        except Exception as exc:
            # Best effort: one broken job page must not abort the whole city,
            # but the failure is reported instead of being silently dropped.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            sys.stderr.write('failed to read %s: %s\n' % (job_url, exc))
def readpage(url):
    """Scrape e-mail addresses from one job page and append them to the output file.

    Downloads ``url`` (2 second timeout), parses it with ``lxml.html`` and
    searches the text of the ``<p>`` elements inside the
    ``tmsginbox / con_msg / in`` div chain for e-mail addresses.  Every
    address found is printed and written, one per line, to the module-level
    file ``f``.

    Args:
        url: Absolute URL of the job page to scrape.

    Raises:
        Any exception raised while fetching or parsing the page propagates
        to the caller.
    """
    # Cookie-aware opener with a browser-like User-Agent, installed globally
    # so that urllib2.urlopen() uses it.
    cj = cookielib.MozillaCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 '
        'Firefox/38.0 Iceweasel/38.3.0',
    )]
    urllib2.install_opener(opener)

    response = urllib2.urlopen(url, timeout=2)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()

    doc = lxml.html.fromstring(html)
    # Compiled once, outside the loops, since the pattern never changes.
    email_re = re.compile(r'[\w\.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
    paragraphs = doc.xpath(
        '//div[@class="tmsginbox"]/div[@class="con_msg"]'
        '/div[@class="in"]/p/text()'
    )
    for text in paragraphs:
        for email in email_re.findall(text):
            print(email)
            f.write(email + '\n')
            # Flush per address so results survive an abrupt stop.
            f.flush()
if __name__ == '__main__':
    # Script entry point: for each city, write the city name as a header line
    # to the output file, then scrape that city's job pages.
    city_list = ['zhangjiagang', 'zhanjiang', 'zhaoqing', 'zibo']
    try:
        for city in city_list:
            f.write(city + '\n')
            f.flush()
            try:
                read(city)
            except Exception as exc:
                # Best effort: a city that cannot be fetched is reported and
                # skipped so the remaining cities are still processed.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                sys.stderr.write('failed to read city %s: %s\n' % (city, exc))
            f.flush()
    finally:
        # Close the output file even when the run is interrupted; close()
        # also flushes any buffered data.
        f.close()
city_list(城市列表)请大家自行整理,只能帮你们到这里了。谢谢大家的阅读,请继续关注毛票票,获取更多精彩内容。