python 爬取微信文章
本人想搞一个采集微信文章的网站，无奈实在无法从微信本身找到入口链接。网上翻看了大量的资料，发现大家的做法总体来说大同小异，都是以搜狗为入口。下文是笔者整理的一份 python 爬取微信文章的代码，有兴趣的欢迎阅读。
#coding:utf-8
#!/usr/bin/env python
# Crawl WeChat official-account article listings through the Sogou WeChat
# search portal (weixin.sogou.com), which exposes a JSONP listing endpoint.
author = 'haoning'

import time
import datetime
import json
import sys
import re
import os
import xml.etree.ElementTree as ET

# Third-party HTTP client; guarded so the module still imports in
# environments where requests is absent (the crawl itself then fails fast).
try:
    import requests
except ImportError:
    requests = None

# NOTE(review): dropped the Python-2-only `reload(sys);
# sys.setdefaultencoding("utf-8")` hack — it is a known anti-pattern and
# does not exist on Python 3; requests/json already handle UTF-8 text.

# Sogou "openid" identifying the WeChat public account to crawl.
#OPENID = 'oIWsFtyel13ZMva1qltQ3pfejlwU'
OPENID = 'oIWsFtw_-W2DaHwRz1oGWzL-wF9M&ext'

# Accumulates article items ('items' entries) gathered from every page.
XML_LIST = []

# Current time in milliseconds since the epoch — used as the cache-busting
# `t` query parameter on the Sogou endpoint.
current_milli_time = lambda: int(round(time.time() * 1000))
def get_json(pageIndex):
    """Fetch one page of article listings for OPENID and return the JSON object.

    pageIndex: 1-based page number to request from the Sogou JSONP endpoint.
    Returns the dict parsed out of the ``sogou.weixin.gzhcb( ... )`` wrapper;
    callers read its 'items', 'totalPages' and 'totalItems' keys.
    Raises ValueError if the JSONP wrapper is not found in the response.
    """
    global OPENID
    the_headers = {
        # Browser-like headers: Sogou rejects requests without a Referer/UA.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Referer': 'http://weixin.sogou.com/gzh?openid={0}'.format(OPENID),
        'Host': 'weixin.sogou.com'
    }
    url = 'http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid={0}&page={1}&t={2}'.format(
        OPENID, pageIndex, current_milli_time())
    print(url)
    response = requests.get(url, headers=the_headers)
    response_text = response.text
    print(response_text)
    # Strip the JSONP wrapper `sogou.weixin.gzhcb( ... )`.
    # BUGFIX: the original computed the end as index(')') - 2, i.e. the FIRST
    # ')' in the body minus two characters — that truncates the payload
    # whenever the JSON itself contains a ')' and clips valid JSON otherwise.
    # Use the callback-prefix length and the LAST ')' instead.
    prefix = 'sogou.weixin.gzhcb('
    json_start = response_text.index(prefix) + len(prefix)
    json_end = response_text.rindex(')')
    json_str = response_text[json_start:json_end]
    # Decode the extracted JSON payload into a Python object.
    json_obj = json.loads(json_str)
    return json_obj
def add_xml(jsonObj):
    """Append one result page's article entries to the global XML_LIST.

    jsonObj: a decoded page object as returned by get_json(); its 'items'
    key holds the list of article entries (XML snippets as strings).
    Raises KeyError if 'items' is missing — a malformed page should be loud.
    """
    global XML_LIST
    # Extend (not append) so XML_LIST stays a flat list of entries.
    XML_LIST.extend(jsonObj['items'])
# source: www.oksousou.com
# ------------ Main ----------------
# Drive the crawl: fetch page 1 to learn the pagination, then walk the
# remaining pages, accumulating every article entry into XML_LIST.
print('playit:)')
# Page 1 doubles as the pagination probe ('totalPages' / 'totalItems').
default_json_obj = get_json(1)
total_pages = 0
total_items = 0
if default_json_obj:
    # Record page 1's items before iterating the rest.
    add_xml(default_json_obj)
    total_pages = default_json_obj['totalPages']
    total_items = default_json_obj['totalItems']
    print(total_pages)
    # Pages are 1-based and page 1 is already loaded, so start at 2.
    if total_pages >= 2:
        for pageIndex in range(2, total_pages + 1):
            add_xml(get_json(pageIndex))
            print('loadpage' + str(pageIndex))
    print(len(XML_LIST))