Crawling WeChat official account articles with Python
This article shares a working example of crawling WeChat official account data with Python. The full code is given below for reference.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import time
import random
import MySQLdb
import threading
import socket
import math

socket.setdefaulttimeout(60)  # timeout for the whole socket layer; later socket use in this file needs no extra setting
glock = threading.Lock()  # global lock
CATEGORY_URL = ['http://www.we123.com/gzh/onclick/']  # region category links
all_url = []
ALL_URLS = []  # all detail-page links
proxy_list = []  # IP pool
URL = 'http://www.we123.com'
PAGE_URL = []  # all pagination links
# build the IP pool
def get_ip():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    url = 'http://http-webapi.zhimaruanjian.com'  # 芝麻代理 works well, stable and inexpensive
    resp = requests.get(url, headers=headers)
    obj = resp.json()  # JSON object describing the IP pool
    for ip in obj:
        arr = 'http://' + str(ip['ip']) + ':' + str(ip['port'])
        proxy_list.append(arr)
# fetch the page source
def get_html(url):
    # headers = {}
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
    ]
    # user_agent = random.choice(user_agent_list)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12501.400'
    }
    # proxies: free proxies only live for a short while, replace them yourself
    # proxy_list = [
    #     "http://27.192.185.62:3252",
    # ]
    # proxy_ip = random.choice(proxy_list)
    # proxies = {'http': proxy_ip}
    # print(str(url))
    try:
        resp = requests.get(url, headers=headers)
        # print("line 72: status_code=" + str(resp.status_code))
        # print(type(resp.text))
        # print(resp.url)  # the requested url
        if resp.status_code == 200:
            return resp
        elif resp.status_code == 404:
            return resp
        elif resp.status_code == 500:
            return resp
        return resp
    except RuntimeError:
        print("timeout")
        return "error"
    except ConnectionError:
        print("connection timeout")
        return "error"
    except RequestException:
        print("requests base-class exception")
        with open('url_exception.txt', 'a+', encoding='utf-8') as f:
            f.write(str(url))
            f.write('\n')
        return "error"
# get the region category links
def get_categoty_url():
    url = 'http://www.we123.com/gzh/onclick/'
    resp = get_html(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    html = soup.select('div.div-subs2>div.divst-content>div.divst-subs>li>a')
    # collect the category link of every region
    for i in html:
        city = i['href'].split("/")[-1]
        if city == '海外' or city == '台湾' or city == '澳门':
            continue
        url = URL + i['href']
        CATEGORY_URL.append(url)
    print(CATEGORY_URL)
# get all pagination links under one region
def get_page_url(url):
    city = url.split('/')[-1]
    html = get_html(url)
    if html == "error":
        print("line 98: connect url error")
        time.sleep(random.randint(10, 20))
        return "error"
    soup = BeautifulSoup(html.text, 'lxml')
    # total number of entries
    all_nums = soup.select("div.page>a>b")
    if len(all_nums) == 0:
        return "error"
    else:
        all_nums = soup.select("div.page>a>b")[0].get_text()
    # total number of pages
    all_pages = math.ceil(int(all_nums) / 30)
    # build all pagination links
    all_page_url = []
    for i in range(0, int(all_pages)):
        page_url = 'http://www.we123.com/e/action/ListInfo.php?page=' + str(i) + '&classid=45&line=30&tempid=10&orderby=onclick&myorder=0&totalnum=' + str(all_nums)
        all_page_url.append(page_url)
    return all_page_url
# get all pagination links
def get_page_urls():
    global PAGE_URL
    c_url = CATEGORY_URL.pop()
    print('line 121: requesting ' + c_url)
    PAGE_URL = get_page_url(c_url)  # all pagination links under this region
# get all detail-page links
def get_info_urls():
    while True:
        global PAGE_URL  # global variable
        glock.acquire()  # lock
        if len(PAGE_URL) == 0:
            glock.release()  # unlock
            print('line 131: PAGE_URL is empty')
            break
        else:
            p_url = PAGE_URL.pop()
            print('line 135: requesting ' + p_url)
            glock.release()  # unlock
        glock.acquire()  # lock
        html = get_html(p_url)
        if html == "error":
            print("line 141: connect url error")
            time.sleep(2)
            glock.release()  # release the lock before returning so the other threads are not blocked
            return
        soup = BeautifulSoup(html.text, 'lxml')
        info_urls = soup.select('div.gzhRight>div.gzh_list>ul>li>a')
        for x in info_urls:
            i_url = URL + x['href']
            ALL_URLS.append(i_url)
        print("links in stock: " + str(len(ALL_URLS)))
        glock.release()  # unlock
# extract the data needed from each detail page
def get_data():
    while True:
        global ALL_URLS  # global variable
        glock.acquire()  # lock
        print("current stock: " + str(len(ALL_URLS)))
        if len(ALL_URLS) == 0:
            glock.release()  # unlock
            print('line 159: ALL_URLS is empty')
            break
        else:
            url = ALL_URLS.pop()
            print("start scraping: " + url)
            glock.release()  # unlock
        time.sleep(1)  # sleep for one second
        html = get_html(url)
        if html == "error":
            print("line 168: connect url error")
            time.sleep(random.randint(2, 4))
            return
        html.encoding = 'utf-8'  # set the page encoding explicitly; usually unnecessary
        soup = BeautifulSoup(html.text, 'lxml')
        # official account name
        names = soup.select('div.artcleLeft>div.xcxnry>div.xcxtop>div.xcxtop_left>div.gzhtop_logo>h1')
        # WeChat ID
        accounts = []
        accounts.append(soup.select('div.artcleLeft>div.xcxnry>div.xcxtop>div.xcxtop_left>div.gzhtop_logo>p')[0])
        # avatar
        imgs = soup.select('div.artcleLeft>div.xcxnry>div.xcxtop>div.xcxtop_left>div.gzhtop_logo>img')
        # account QR code
        QR_codes = soup.select('div.artcleLeft>div.xcxnry>div.xcxtop>div.xcxtop_right>img')
        # introduction
        descs = soup.select('div.artcleLeft>div.xcxnry>div.xcxinfo')
        # account category
        categorys = []
        category = ''
        cate = soup.select('div.artcleLeft>div.xcxnry>div.xcxtop>div.xcxtop_left>div.xcx_p>span>a')
        if not len(cate) == 0:
            category = cate[0].get_text()
        else:
            category = '综合'
        glock.acquire()  # lock
        for name, account, img, QR_code, desc in zip(names, accounts, imgs, QR_codes, descs):
            data = {
                'name': name.get_text(),
                'category': category,
                'account': account.get_text().split(":")[-1],
                'img': img['src'],
                'QR_code': QR_code['src'],
                'desc': desc.get_text()
            }
            add_data(data, url)
        glock.release()  # unlock
# insert one record into the database
def add_data(data, url):
    con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8", use_unicode=True)
    cursor = con.cursor()
    # exit()
    insert_sql = """
        insert ignore into weixin5(w_name, category, account, img, QR_code, introduce)
        VALUES (%s, %s, %s, %s, %s, %s)
    """
    try:
        cursor.execute(insert_sql, (data['name'], data['category'], data['account'], data['img'], data['QR_code'], str(data['desc'])))
        con.commit()
        print('line 212: ' + data['name'] + '_' + data['account'] + ' inserted - ' + url)
    except:
        ALL_URLS.insert(0, url)
        print("line 218: " + url + ' insert failed')
        con.rollback()
    con.close()
# convert a time string into a timestamp
def time_to(dt):
    timeArray = time.strptime(dt, "%Y年%m月%d日")
    timestamp = int(time.mktime(timeArray))
    return timestamp
# start the multi-threaded crawl
def main():
    for x in range(3):
        th = threading.Thread(target=get_info_urls)
        th.start()
        # get_info_urls()
    time.sleep(3)
    for x in range(5):
        th = threading.Thread(target=get_data)
        th.start()

if __name__ == '__main__':
    # timing
    t1 = time.time()
    # run
    get_ip()  # build the IP pool
    get_page_urls()
    time.sleep(2)
    # get_categoty_url()
    main()
    print(time.time() - t1)
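add_data() expects a MySQL database named test that already contains a table called weixin5, but the article does not show its definition. Below is a minimal sketch of one possible schema, inferred purely from the column names in the INSERT statement; the column types, lengths and the unique key on account are assumptions (the unique key is what makes insert ignore skip duplicates). It reuses the same connection parameters as add_data(). The script also needs the third-party packages requests, beautifulsoup4, lxml and MySQLdb (mysqlclient).

# Sketch only: one possible definition of the weixin5 table that add_data() writes to.
import MySQLdb

ddl = """
CREATE TABLE IF NOT EXISTS weixin5 (
    id INT AUTO_INCREMENT PRIMARY KEY,
    w_name VARCHAR(255),              -- account name
    category VARCHAR(64),             -- account category
    account VARCHAR(128),             -- WeChat ID
    img VARCHAR(512),                 -- avatar URL
    QR_code VARCHAR(512),             -- QR-code image URL
    introduce TEXT,                   -- introduction text
    UNIQUE KEY uk_account (account)   -- assumed; lets "insert ignore" skip duplicates
) DEFAULT CHARSET = utf8
"""

con = MySQLdb.connect('127.0.0.1', 'root', 'root', 'test', charset="utf8")
cursor = con.cursor()
cursor.execute(ddl)
con.close()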
That is all for this article. I hope it helps with your studies, and I hope you will keep supporting 毛票票.