A summary of several ways to parse HTML in Python 3
Parsing HTML is an important data-processing step after crawling a page. The notes below record several ways to parse HTML.
First, a basic helper function, used mainly to fetch the HTML and return the parsed result.
# Imports used by the examples in this article (Python 2 standard library plus lxml and BeautifulSoup)
import gzip
import re
import HTMLParser
import StringIO
import urllib2
from sgmllib import SGMLParser

from bs4 import BeautifulSoup
from lxml import etree

# The parser function is passed in as an argument, so it is easy to swap it out below
def get_html(url, paraser=bs4_paraser):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.360kan.com',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    if response.code == 200:
        # The page is served gzip-compressed, so decompress it before handing it to the parser
        data = StringIO.StringIO(response.read())
        gzipper = gzip.GzipFile(fileobj=data)
        data = gzipper.read()
        value = paraser(data)  # open('E:/h5/haPkY0osd0r5UB.html').read()
        return value
    else:
        pass

value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
for row in value:
    print row
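The helper above relies on the Python 2 modules urllib2 and StringIO. Since the article is aimed at Python 3, here is a minimal sketch of the same fetch-decompress-parse helper written against the Python 3 standard library; the name get_html_py3 is only illustrative, and like the version above it assumes the server returns a gzip-compressed body:

import gzip
import io
import urllib.request

def get_html_py3(url, paraser):
    # Python 3 counterpart of get_html: fetch, gunzip, decode, then hand off to a parser
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    if response.getcode() == 200:
        # Assumes a gzip-compressed, UTF-8 encoded body, as in the Python 2 helper above
        body = gzip.GzipFile(fileobj=io.BytesIO(response.read())).read()
        return paraser(body.decode('utf-8'))
    return None

It is called the same way, for example get_html_py3('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser).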
1. Parsing with lxml
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [Official site](http://lxml.de/)
def lxml_parser(page):
    data = []
    doc = etree.HTML(page)
    all_div = doc.xpath('//div[@class="yingping-list-wrap"]')
    for row in all_div:
        # Each review item
        all_div_item = row.xpath('.//div[@class="item"]')  # find_all('div', attrs={'class': 'item'})
        for r in all_div_item:
            value = {}
            # Title block of the review
            title = r.xpath('.//div[@class="g-clear title-wrap"][1]')
            value['title'] = title[0].xpath('./a/text()')[0]
            value['title_href'] = title[0].xpath('./a/@href')[0]
            score_text = title[0].xpath('./div/span/span/@style')[0]
            score_text = re.search(r'\d+', score_text).group()
            value['score'] = int(score_text) / 20
            # Review time
            value['time'] = title[0].xpath('./div/span[@class="time"]/text()')[0]
            # Number of people who liked the review
            value['people'] = int(
                re.search(r'\d+', title[0].xpath('./div[@class="num"]/span/text()')[0]).group())
            data.append(value)
    return data
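To see the etree.HTML / xpath pattern used by lxml_parser in isolation, here is a tiny self-contained Python 3 sketch; the HTML snippet is invented purely for illustration and only mimics the structure scraped above:

from lxml import etree

snippet = '''
<div class="yingping-list-wrap">
  <div class="item">
    <a href="/review/1">Great movie</a>
  </div>
</div>
'''

# etree.HTML parses a string into an element tree; xpath() then selects nodes
doc = etree.HTML(snippet)
for item in doc.xpath('//div[@class="yingping-list-wrap"]//div[@class="item"]'):
    a = item.xpath('./a')[0]
    print(a.text, a.get('href'))   # -> Great movie /review/1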
2. Parsing with BeautifulSoup. Not much needs to be said here; plenty of documentation is available online.
def bs4_paraser(html):
    all_value = []
    value = {}
    soup = BeautifulSoup(html, 'html.parser')
    # The review list container
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Each review item
        all_div_item = row.find_all('div', attrs={'class': 'item'})
        for r in all_div_item:
            # Title block of the review
            title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if title is not None and len(title) > 0:
                value['title'] = title[0].a.string
                value['title_href'] = title[0].a['href']
                score_text = title[0].div.span.span['style']
                score_text = re.search(r'\d+', score_text).group()
                value['score'] = int(score_text) / 20
                # Review time
                value['time'] = title[0].div.find_all('span', attrs={'class': 'time'})[0].string
                # Number of people who liked the review
                value['people'] = int(
                    re.search(r'\d+', title[0].find_all('div', attrs={'class': 'num'})[0].span.string).group())
                # print r
                all_value.append(value)
                value = {}
    return all_value
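As a quick illustration of the BeautifulSoup calls used above (find_all with an attrs filter and limit, then .a.string and .a['href']), here is a small self-contained sketch on an invented, simplified snippet:

from bs4 import BeautifulSoup

snippet = '<div class="item"><div class="title-wrap">' \
          '<a href="/review/1">Great movie</a></div></div>'

soup = BeautifulSoup(snippet, 'html.parser')
# find_all returns a list; limit=1 stops after the first match
item = soup.find_all('div', attrs={'class': 'item'}, limit=1)[0]
print(item.a.string)    # -> Great movie
print(item.a['href'])   # -> /review/1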
3. Parsing with SGMLParser, which works through start/end tag callbacks. The parsing flow is clear enough, but it is somewhat tedious, and this particular scenario is not a great fit for the approach.
class CommentParaser(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        # nesting flags for the div levels we care about
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # <a> tag state
        self.__start_a = False
        # the span can be in one of several states
        self.__span_state = 0
        # collected data
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        # record which div level we just entered, based on its class
        for k, v in attrs:
            if k == 'class' and v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif k == 'class' and v == 'item':
                self.__start_div_item = True
            elif k == 'class' and v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif k == 'class' and v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif k == 'class' and v == 'num':
                self.__start_div_num = True

    def end_div(self):
        # close the innermost div level that is currently open
        if self.__start_div_yingping:
            if self.__start_div_item:
                if self.__start_div_gclear:
                    if self.__start_div_num or self.__start_div_ratingwrap:
                        if self.__start_div_num:
                            self.__start_div_num = False
                        if self.__start_div_ratingwrap:
                            self.__start_div_ratingwrap = False
                    else:
                        self.__start_div_gclear = False
                else:
                    self.data.append(self.__value)
                    self.__value = {}
                    self.__start_div_item = False
            else:
                self.__start_div_yingping = False

    def start_a(self, attrs):
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            if self.__start_div_ratingwrap:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    for k, v in attrs:
                        if k == 'style':
                            score_text = re.search(r'\d+', v).group()
                            self.__value['score'] = int(score_text) / 20
                            self.__span_state = 3
            elif self.__start_div_num:
                self.__span_state = 4

    def end_span(self):
        self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            score_text = re.search(r'\d+', data).group()
            self.__value['people'] = int(score_text)


def sgl_parser(html):
    parser = CommentParaser()
    parser.feed(html)
    return parser.data
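As a minimal illustration of the SGMLParser callback style (per-tag start_a / end_a methods plus handle_data), here is a short self-contained sketch. Note that sgmllib only exists in the Python 2 standard library, and the LinkLister class is made up for this example:

from sgmllib import SGMLParser  # Python 2 only; sgmllib was removed in Python 3

class LinkLister(SGMLParser):
    # SGMLParser calls start_a / end_a around <a> tags and handle_data for text between tags
    def __init__(self):
        SGMLParser.__init__(self)
        self.in_a = False
        self.links = []

    def start_a(self, attrs):
        self.in_a = True
        self.href = dict(attrs).get('href')

    def end_a(self):
        self.in_a = False

    def handle_data(self, data):
        if self.in_a:
            self.links.append((data, self.href))

parser = LinkLister()
parser.feed('<p><a href="/review/1">Great movie</a></p>')
print parser.links  # -> [('Great movie', '/review/1')]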
4. HTMLParser works on the same principle as method 3; only the callback methods differ, so most of the logic can be shared.
class CommentHTMLParser(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # nesting flags for the div levels we care about
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # <a> tag state
        self.__start_a = False
        # the span can be in one of several states
        self.__span_state = 0
        # collected data
        self.__value = {}
        self.data = []

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            # record which div level we just entered, based on its class
            for k, v in attrs:
                if k == 'class' and v == 'yingping-list-wrap':
                    self.__start_div_yingping = True
                elif k == 'class' and v == 'item':
                    self.__start_div_item = True
                elif k == 'class' and v == 'g-clear title-wrap':
                    self.__start_div_gclear = True
                elif k == 'class' and v == 'rating-wrap g-clear':
                    self.__start_div_ratingwrap = True
                elif k == 'class' and v == 'num':
                    self.__start_div_num = True
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        self.__value['href'] = v
        elif tag == 'span':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                if self.__start_div_ratingwrap:
                    if self.__span_state != 1:
                        for k, v in attrs:
                            if k == 'class' and v == 'rating':
                                self.__span_state = 1
                            elif k == 'class' and v == 'time':
                                self.__span_state = 2
                    else:
                        for k, v in attrs:
                            if k == 'style':
                                score_text = re.search(r'\d+', v).group()
                                self.__value['score'] = int(score_text) / 20
                                self.__span_state = 3
                elif self.__start_div_num:
                    self.__span_state = 4

    def handle_endtag(self, tag):
        if tag == 'div':
            # close the innermost div level that is currently open
            if self.__start_div_yingping:
                if self.__start_div_item:
                    if self.__start_div_gclear:
                        if self.__start_div_num or self.__start_div_ratingwrap:
                            if self.__start_div_num:
                                self.__start_div_num = False
                            if self.__start_div_ratingwrap:
                                self.__start_div_ratingwrap = False
                        else:
                            self.__start_div_gclear = False
                    else:
                        self.data.append(self.__value)
                        self.__value = {}
                        self.__start_div_item = False
                else:
                    self.__start_div_yingping = False
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            score_text = re.search(r'\d+', data).group()
            self.__value['people'] = int(score_text)


def html_parser(html):
    parser = CommentHTMLParser()
    parser.feed(html)
    return parser.data
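For a minimal, self-contained view of the HTMLParser callback flow (handle_starttag, handle_endtag, handle_data), here is a short Python 3 sketch; in Python 3 the class lives in html.parser rather than the Python 2 HTMLParser module used above, and LinkLister is again just an illustrative name:

from html.parser import HTMLParser  # in Python 2 this was the HTMLParser module

class LinkLister(HTMLParser):
    # Same idea as method 3, but with generic handle_starttag / handle_endtag
    # callbacks that receive the tag name, instead of per-tag start_a / end_a methods
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_a = False
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.in_a = True
            self.href = dict(attrs).get('href')

    def handle_endtag(self, tag):
        if tag == 'a':
            self.in_a = False

    def handle_data(self, data):
        if self.in_a:
            self.links.append((data, self.href))

parser = LinkLister()
parser.feed('<p><a href="/review/1">Great movie</a></p>')
print(parser.links)  # -> [('Great movie', '/review/1')]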
Methods 3 and 4 are indeed not a great fit for this case, but I am recording them here while I have the time, for learning and future reference.