python爬取51job中hr的邮箱
本文实例为大家分享了 Python 爬取 51job 中 HR 邮箱的具体代码，供大家参考，具体内容如下：
# encoding=utf8
"""Collect HR contact e-mail addresses from 51job city listing pages.

For every city slug the script downloads ``http://www.51job.com/<city>``,
follows each "hot" job link found on that page, and appends every e-mail
address found in the job's contact section to ``OUTPUT_PATH``.

Runs on Python 2.7 and Python 3. ``readpage`` requires ``lxml``.
"""
import contextlib
import logging
import re

try:  # Python 3
    from http.cookiejar import MozillaCookieJar
    from urllib.request import HTTPCookieProcessor, build_opener
except ImportError:  # Python 2
    from cookielib import MozillaCookieJar
    from urllib2 import HTTPCookieProcessor, build_opener

logger = logging.getLogger(__name__)

OUTPUT_PATH = '/root/Desktop/51-01.txt'
BASE_URL = 'http://www.51job.com/'
USER_AGENT = (
    'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) '
    'Gecko/20100101 Firefox/38.0 Iceweasel/38.3.0'
)
PAGE_TIMEOUT = 2  # seconds allowed for each individual job page

JOB_LINK_RE = re.compile(r'http://jobs\.51job\.com/hot/.*?html')
# The local part is spelled out instead of using \w so that matching stays
# ASCII-only on Python 3, where \w would also swallow CJK characters that
# directly precede an address.
EMAIL_RE = re.compile(r'[A-Za-z0-9_.-]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
CONTACT_XPATH = (
    '//div[@class="tmsginbox"]/div[@class="con_msg"]'
    '/div[@class="in"]/p/text()'
)


def _new_opener():
    """Return a URL opener with its own cookie jar and a browser User-Agent."""
    opener = build_opener(HTTPCookieProcessor(MozillaCookieJar()))
    opener.addheaders = [('User-agent', USER_AGENT)]
    return opener


def _fetch(url, **open_kwargs):
    """Download *url* and return the raw response body.

    A fresh opener (and therefore a fresh cookie jar) is used for every
    request. Extra keyword arguments such as ``timeout`` are passed to
    ``opener.open``. The response is always closed.
    """
    with contextlib.closing(_new_opener().open(url, **open_kwargs)) as response:
        return response.read()


@contextlib.contextmanager
def _output_file(out):
    """Yield *out* if given, otherwise ``OUTPUT_PATH`` opened for appending."""
    if out is not None:
        yield out
        return
    with open(OUTPUT_PATH, 'a+') as handle:
        yield handle


def extract_job_links(page):
    """Return all "hot" job URLs found in *page*, in document order.

    *page* may be raw bytes or text. Bytes are decoded as latin-1, which maps
    every byte to one character and so never fails; the URLs themselves are
    plain ASCII.
    """
    if isinstance(page, bytes):
        page = page.decode('latin-1')
    return JOB_LINK_RE.findall(page)


def extract_emails(text):
    """Return every e-mail address found in *text*, in order of appearance."""
    return EMAIL_RE.findall(text)


def readpage(url, out=None):
    """Fetch one job page and record the e-mail addresses it contains.

    Each address is printed and appended, one per line, to *out*.

    Args:
        url: Address of the job page.
        out: Writable text file object. When omitted, ``OUTPUT_PATH`` is
            opened in append mode for the duration of the call.

    Raises:
        Whatever the download or the HTML parser raises; nothing is
        swallowed here so the caller decides how to handle a bad page.
    """
    # Imported here so the pure helpers stay usable without lxml installed.
    import lxml.html

    page = _fetch(url, timeout=PAGE_TIMEOUT)
    doc = lxml.html.fromstring(page)
    with _output_file(out) as sink:
        for text in doc.xpath(CONTACT_XPATH):
            for email in extract_emails(text):
                print(email)
                sink.write(email + '\n')
                # Flush per address so results survive an interrupted crawl.
                sink.flush()


def read(city, out=None):
    """Crawl every "hot" job linked from the listing page of *city*.

    Each job URL is printed and handed to ``readpage``. A page that fails is
    logged and skipped so one bad page does not stop the crawl.

    Args:
        city: City slug appended to ``BASE_URL`` (e.g. ``'zibo'``).
        out: Writable text file object. When omitted, ``OUTPUT_PATH`` is
            opened in append mode for the duration of the call.

    Raises:
        Whatever the download of the city listing page raises.
    """
    page = _fetch(BASE_URL + city)
    with _output_file(out) as sink:
        for link in extract_job_links(page):
            print(link)
            try:
                readpage(link, sink)
            except Exception as exc:
                # Best-effort crawl: skip the page, but let Ctrl-C and
                # SystemExit propagate (a bare ``except`` would eat them).
                logger.warning('skipping %s: %s', link, exc)


def main():
    """Crawl the configured cities and append the results to OUTPUT_PATH."""
    logging.basicConfig()
    city_list = ['zhangjiagang', 'zhanjiang', 'zhaoqing', 'zibo']
    with open(OUTPUT_PATH, 'a+') as out:
        for city in city_list:
            # A line with the city name separates each city's addresses.
            out.write(city + '\n')
            out.flush()
            try:
                read(city, out)
            except Exception as exc:
                # Same best-effort policy as in read(): log and move on.
                logger.warning('skipping city %s: %s', city, exc)
            out.flush()


if __name__ == '__main__':
    main()
city_list 请大家自己整理一下（代码中只列出了几个示例城市），只能帮你们到这里了。谢谢大家的阅读，请继续关注毛票票，获取更多精彩内容。