Python 爬取微信文章
本人想搞个采集微信文章的网站，无奈实在无法从微信本身找到入口链接。网上翻看了大量资料后，发现大家的做法总体来说大同小异，都是以搜狗为入口。下文是笔者整理的一份 Python 爬取微信文章的代码，有兴趣的欢迎阅读。
#coding:utf-8 author='haoning' **#!/usr/bin/envpython importtime importdatetime importrequests** importjson importsys reload(sys) sys.setdefaultencoding("utf-8") importre importxml.etree.ElementTreeasET importos #OPENID='oIWsFtyel13ZMva1qltQ3pfejlwU' OPENID='oIWsFtw_-W2DaHwRz1oGWzL-wF9M&ext' XML_LIST=[] #getcurrenttimeinmilliseconds current_milli_time=lambda:int(round(time.time()*1000)) defget_json(pageIndex): globalOPENID the_headers={ 'User-Agent':'Mozilla/5.0(Macintosh;IntelMacOSX10_9_5)AppleWebKit/537.36(KHTML,likeGecko)Chrome/39.0.2171.95Safari/537.36', 'Referer':'http://weixin.sogou.com/gzh?openid={0}'.format(OPENID), 'Host':'weixin.sogou.com' } url='http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid={0}&page={1}&t={2}'.format(OPENID,pageIndex,current_milli_time())#url print(url) response=requests.get(url,headers=the_headers) #TO-DO;checkifmatchthereg response_text=response.text printresponse_text json_start=response_text.index('sogou.weixin.gzhcb(')+19 json_end=response_text.index(')')-2 json_str=response_text[json_start:json_end]#getjson #print(json_str) #convertjson_strtojsonobject json_obj=json.loads(json_str)#getjsonobj #printjson_obj['totalPages'] returnjson_obj defadd_xml(jsonObj): globalXML_LIST xmls=jsonObj['items']#getitem #printtype(xmls) XML_LIST.extend(xmls)#用新列表扩展原来的列表 **[#www.oksousou.com][2]** #------------Main---------------- print'playit:)' #gettotalpages default_json_obj=get_json(1) total_pages=0 total_items=0 if(default_json_obj): #addthedefaultxmls add_xml(default_json_obj) #gettherestitems total_pages=default_json_obj['totalPages'] total_items=default_json_obj['totalItems'] printtotal_pages #iterateallpages if(total_pages>=2): forpageIndexinrange(2,total_pages+1): add_xml(get_json(pageIndex))#extend print'loadpage'+str(pageIndex) printlen(XML_LIST)