Scraping WeChat Official Account articles with Python
This article shares a working example of scraping WeChat Official Account articles with Python. The full code is given below for your reference.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time
import random
import MySQLdb
import threading
import socket
import math

socket.setdefaulttimeout(60)  # Set a timeout on the whole socket layer; later socket use inherits it
glock = threading.Lock()      # Global lock
CATEGORY_URL = ['http://www.we123.com/gzh/onclick/']  # Regional category links
all_url = []
ALL_URLS = []     # All detail-page links
proxy_list = []   # IP pool
URL = 'http://www.we123.com'
PAGE_URL = []     # All pagination links

# Build the IP pool
def get_ip():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    url = 'http://http-webapi.zhimaruanjian.com'  # Zhima proxy (zhimaruanjian): easy to use, stable and cheap
    resp = requests.get(url, headers=headers)
    obj = resp.json()  # JSON object holding the IP pool; assumed to be a list of {'ip': ..., 'port': ...}
    for ip in obj:
        arr = 'http://' + str(ip['ip']) + ':' + str(ip['port'])
        proxy_list.append(arr)

# Fetch the page source
def get_html(url):
    # headers = {}
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
    ]
    # user_agent = random.choice(user_agent_list)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400'
    }
    # Proxies: free proxies usually stop working after a short while, replace them yourself
    # proxy_list = [
    #     "http://27.192.185.62:3252",
    # ]
    # proxy_ip = random.choice(proxy_list)
    # proxies = {'http': proxy_ip}
    # print(str(url))
    try:
        resp = requests.get(url, headers=headers)
        # print("line 72: status_code=" + str(resp.status_code))
        # print(type(resp.text))
        # print(resp.url)  # The requested url
        if resp.status_code == 200:
            return resp
        elif resp.status_code == 404:
            return resp
        elif resp.status_code == 500:
            return resp
        return resp
    except RuntimeError:
        print("Timed out")
        return "error"
    except ConnectionError:
        print("Connection timed out")
        return "error"
    except RequestException:
        print("requests base exception")
        with open('url_exception.txt', 'a+', encoding='utf-8') as f:
            f.write(str(url))
            f.write('\n')
        return "error"

# Fetch the regional category links
def get_categoty_url():
    url = 'http://www.we123.com/gzh/onclick/'
    resp = get_html(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    html = soup.select('div.div-subs2 > div.divst-content > div.divst-subs > li > a')
    # Collect the regional category links
    for i in html:
        city = i['href'].split("/")[-1]
        if (city == '海外' or city == '台湾' or city == '澳门'):  # skip overseas, Taiwan, Macau
            continue
        url = URL + i['href']
        CATEGORY_URL.append(url)
    print(CATEGORY_URL)

# Fetch every pagination link under one region
def get_page_url(url):
    city = url.split('/')[-1]
    html = get_html(url)
    if html == "error":
        print("line 98: connect url error")
        time.sleep(random.randint(10, 20))
        return "error"
    soup = BeautifulSoup(html.text, 'lxml')
    # Total number of records
    all_nums = soup.select("div.page > a > b")
    if len(all_nums) == 0:
        return "error"
    else:
        all_nums = soup.select("div.page > a > b")[0].get_text()
    # Total number of pages
    all_pages = math.ceil((int(all_nums) / 30))
    # Build every pagination link
    all_page_url = []
    for i in range(0, int(all_pages)):
        page_url = 'http://www.we123.com/e/action/ListInfo.php?page=' + str(i) + '&classid=45&line=30&tempid=10&orderby=onclick&myorder=0&totalnum=' + str(all_nums)
        all_page_url.append(page_url)
    return all_page_url

# Fill PAGE_URL with all pagination links
def get_page_urls():
    global PAGE_URL
    c_url = CATEGORY_URL.pop()
    print('line 121: requesting ' + c_url)
    PAGE_URL = get_page_url(c_url)  # Every pagination link under this region

# Collect all detail-page links
def get_info_urls():
    while True:
        global PAGE_URL   # Use the global variable
        glock.acquire()   # Lock
        if len(PAGE_URL) == 0:
            glock.release()  # Unlock
            print('line 131: PAGE_URL is empty')
            break
        else:
            p_url = PAGE_URL.pop()
            print('line 135: requesting ' + p_url)
            glock.release()  # Unlock
            glock.acquire()  # Lock
            html = get_html(p_url)
            if html == "error":
                print("line 141: connect url error")
                time.sleep(2)
                return
            soup = BeautifulSoup(html.text, 'lxml')
            info_urls = soup.select('div.gzhRight > div.gzh_list > ul > li > a')
            for x in info_urls:
                i_url = URL + x['href']
                ALL_URLS.append(i_url)
            print("Links in stock: " + str(len(ALL_URLS)))
            glock.release()  # Unlock

# Scrape the data we need from each detail page
def get_data():
    while True:
        global ALL_URLS   # Use the global variable
        glock.acquire()   # Lock
        print("Current stock: " + str(len(ALL_URLS)))
        if len(ALL_URLS) == 0:
            glock.release()  # Unlock
            print('line 159: ALL_URLS is empty')
            break
        else:
            url = ALL_URLS.pop()
            print("Start scraping: " + url)
            glock.release()  # Unlock
            time.sleep(1)    # Sleep for one second
            html = get_html(url)
            if html == "error":
                print("line 168: connect url error")
                time.sleep(random.randint(2, 4))
                return
            html.encoding = 'utf-8'  # Set the page encoding explicitly; usually not needed
            soup = BeautifulSoup(html.text, 'lxml')
            # Official account name
            names = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > h1')
            # WeChat account id
            accounts = []
            accounts.append(soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > p')[0])
            # WeChat avatar
            imgs = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.gzhtop_logo > img')
            # Official account QR code
            QR_codes = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_right > img')
            # Introduction
            descs = soup.select('div.artcleLeft > div.xcxnry > div.xcxinfo')
            # Official account category
            categorys = []
            category = ''
            cate = soup.select('div.artcleLeft > div.xcxnry > div.xcxtop > div.xcxtop_left > div.xcx_p > span > a')
            if not len(cate) == 0:
                category = cate[0].get_text()
            else:
                category = '综合'  # Default category ("general")
            glock.acquire()  # Lock
            for name, account, img, QR_code, desc in zip(names, accounts, imgs, QR_codes, descs):
                data = {
                    'name': name.get_text(),
                    'category': category,
                    'account': account.get_text().split(":")[-1],
                    'img': img['src'],
                    'QR_code': QR_code['src'],
                    'desc': desc.get_text()
                }
                add_data(data, url)
            glock.release()  # Unlock

# Insert one record into the database
def add_data(data, url):
    con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
    cursor = con.cursor()
    # exit()
    insert_sql = """
        insert ignore into weixin5(w_name, category, account, img, QR_code, introduce)
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    print('line 212: ' + data['name'] + '_' + data['account'] + ' added! - ' + url)
    try:
        cursor.execute(insert_sql, (data['name'], data['category'], data['account'], data['img'], data['QR_code'], str(data['desc'])))
        con.commit()
    except:
        ALL_URLS.insert(0, url)  # Put the url back so it can be retried
        print("line 218: " + url + ' insert failed')
        con.rollback()
    con.close()

# Convert a time string to a timestamp
def time_to(dt):
    timeArray = time.strptime(dt, "%Y年%m月%d日")
    timestamp = int(time.mktime(timeArray))
    return timestamp

# Start the multithreaded crawl
def main():
    for x in range(3):
        th = threading.Thread(target=get_info_urls)
        th.start()
    # get_info_urls()
    time.sleep(3)
    for x in range(5):
        th = threading.Thread(target=get_data)
        th.start()

if __name__ == '__main__':
    # Timing
    t1 = time.time()
    # Call the functions
    get_ip()  # Build the IP pool
    get_page_urls()
    time.sleep(2)
    # get_categoty_url()
    main()
    print(time.time() - t1)
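The add_data function assumes a MySQL database named test with a weixin5 table already created. The article does not show the table definition, so the snippet below is only a minimal sketch of a schema that matches the insert ignore statement above; the table and column names come from the code, while the column types and the unique key are assumptions.

# -*- coding: utf-8 -*-
# Sketch of the table assumed by add_data(); types and the unique key are assumptions.
import MySQLdb

create_sql = """
CREATE TABLE IF NOT EXISTS weixin5 (
    id INT AUTO_INCREMENT PRIMARY KEY,
    w_name VARCHAR(255),
    category VARCHAR(64),
    account VARCHAR(128),
    img VARCHAR(512),
    QR_code VARCHAR(512),
    introduce TEXT,
    UNIQUE KEY uk_account (account)  -- assumed; gives "insert ignore" something to deduplicate on
) DEFAULT CHARSET=utf8
"""

con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
cursor = con.cursor()
cursor.execute(create_sql)
con.commit()
con.close()

With a unique key like the assumed one on account, the insert ignore in add_data silently skips accounts that were already stored instead of raising a duplicate-key error on re-crawls.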
That is all for this article. I hope it helps with your studies, and I hope you will continue to support 毛票票.