python采集百度百科的方法

2024-03-30 23:25:03 402

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Filename: get_baike.py
"""Fetch one Baidu Baike entry and print its title and summary text.

NOTE(review): the published listing of this script had every space
stripped out, so the statement layout, the whitespace inside the two tag
patterns and the string literals in ``clearBlank`` are reconstructed here.
Confirm against the original script if it is available.
"""
import re
import sys

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen() into urllib.request; bind it to the old
    # module name so the rest of the script is identical on both versions.
    import urllib.request as urllib2

# Matches any single markup tag, e.g. "<b>" or "</a>".
_TAG_RE = re.compile(r'<[^>]*?>')


def getHtml(url, time=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch.
        time: Socket timeout in seconds, passed to ``urlopen``.

    Returns:
        The undecoded body exactly as ``response.read()`` yields it.

    Raises:
        Whatever ``urlopen`` or ``read`` raise (network errors, timeouts).
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        return response.read()
    finally:
        # Close even when read() fails so the connection is not leaked.
        response.close()


def clearBlank(html):
    """Return *html* without line breaks or tabs and with blanks collapsed.

    Carriage returns, newlines and tabs are deleted. After that every run
    of spaces and ``&nbsp;`` entities is replaced by one single space.

    The garbled listing looped on ``html.find("") != -1``, which is true
    for every string, so the function could never return for non-empty
    input. NOTE(review): the ``&nbsp;`` / double-space literals are the
    presumed original intent -- confirm.

    Args:
        html: Markup as a text string. An empty value yields ``''``.

    Returns:
        The cleaned string.
    """
    if not html:
        return ''
    html = re.sub('\r|\n|\t', '', html)
    # One pass is equivalent to repeatedly replacing "&nbsp;" with a space
    # and double spaces with one space until neither occurs any more.
    return re.sub('(?:&nbsp;| )+', ' ', html)


def _first_match_text(pattern, html):
    """Return the first capture of *pattern* in *html*, tags removed, or None."""
    found = re.search(pattern, html)
    if found is None:
        return None
    return _TAG_RE.sub('', found.group(1))


def _emit(text):
    """Print *text*; on Python 2 encode it to UTF-8 bytes first."""
    if sys.version_info[0] < 3:
        text = text.encode('utf-8')
    print(text)


def main():
    """Fetch the hard-coded entry and print its title and summary."""
    raw = getHtml('http://baike.baidu.com/view/4617031.htm', 10)
    # Transcode once at the I/O boundary; bytes that are not valid GB2312
    # are replaced instead of raising.
    html = raw.decode('gb2312', 'replace')
    # \s+ stands in for the spaces the listing lost between the tag name
    # and its attributes.
    title_reg = r'<h1\s+class="title"\s+id="[\d]+">(.*?)</h1>'
    content_reg = r'<div\s+class="card-summary-content">(.*?)</p>'
    title = _first_match_text(title_reg, html)
    content = _first_match_text(content_reg, html)
    if title is None or content is None:
        # Report a missing match explicitly rather than failing with a
        # bare IndexError on an empty result list.
        sys.exit('get_baike: title or summary not found in the page')
    _emit(title)
    _emit('#######################')
    _emit(content)


if __name__ == '__main__':
    main()

python采集百度百科的方法

热门推荐

随机推荐