A Summary of Several Ways to Parse HTML in Python 3
Parsing HTML is an important data-processing step that follows crawling. The notes below record several ways of parsing HTML.
First, here is a basic helper function, mainly used to fetch the HTML and return the parsed result.
```python
import gzip
import StringIO
import urllib2

# The parser function is passed in as a parameter so it is easy to swap below
def get_html(url, paraser=bs4_paraser):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.360kan.com',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    if response.code == 200:
        # The response body is gzip-compressed, so decompress it before parsing
        data = StringIO.StringIO(response.read())
        gzipper = gzip.GzipFile(fileobj=data)
        data = gzipper.read()
        value = paraser(data)  # open('E:/h5/haPkY0osd0r5UB.html').read()
        return value
    else:
        pass

value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
for row in value:
    print row
```
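The helper above is written against Python 2 modules (urllib2 and StringIO, plus the print statement). For readers who actually run Python 3, the following is a minimal sketch of an equivalent fetch helper; the function name get_html_py3 is made up for this illustration, the headers come from the example above, and error handling is kept deliberately simple.

```python
# Minimal Python 3 sketch of the same helper, using urllib.request instead of urllib2.
import gzip
import io
import urllib.request

def get_html_py3(url, paraser):
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    if response.status == 200:
        raw = response.read()
        # Decompress only if the server actually answered with gzip
        if response.headers.get('Content-Encoding') == 'gzip':
            raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
        return paraser(raw.decode('utf-8'))
    return None
```

It would be called the same way as the original, e.g. `get_html_py3('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)`.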
1. Parsing with lxml.html
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [Official site](http://lxml.de/)
```python
import re
from lxml import etree

def lxml_parser(page):
    data = []
    doc = etree.HTML(page)
    all_div = doc.xpath('//div[@class="yingping-list-wrap"]')
    for row in all_div:
        # Get each review, i.e. the review item
        all_div_item = row.xpath('.//div[@class="item"]')  # find_all('div', attrs={'class': 'item'})
        for r in all_div_item:
            value = {}
            # Get the title part of the review
            title = r.xpath('.//div[@class="g-cleartitle-wrap"][1]')
            value['title'] = title[0].xpath('./a/text()')[0]
            value['title_href'] = title[0].xpath('./a/@href')[0]
            score_text = title[0].xpath('./div/span/span/@style')[0]
            score_text = re.search(r'\d+', score_text).group()
            value['score'] = int(score_text) / 20
            # Time
            value['time'] = title[0].xpath('./div/span[@class="time"]/text()')[0]
            # How many people liked it
            value['people'] = int(
                re.search(r'\d+', title[0].xpath('./div[@class="num"]/span/text()')[0]).group())
            data.append(value)
    return data
```
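To make the XPath calls above easier to follow, here is a small self-contained sketch that applies the same class-based XPath pattern to a hand-written snippet; the snippet and its values are made up for illustration and do not reflect the real 360kan markup.

```python
# Minimal illustration of the etree.HTML + XPath pattern used above,
# run against a tiny hand-written snippet rather than the real page.
from lxml import etree

snippet = '''
<div class="yingping-list-wrap">
  <div class="item">
    <a href="/review/1">Great movie</a>
    <span class="time">2016-08-01</span>
  </div>
</div>
'''

doc = etree.HTML(snippet)
for item in doc.xpath('//div[@class="yingping-list-wrap"]//div[@class="item"]'):
    review = {
        'title': item.xpath('./a/text()')[0],              # -> 'Great movie'
        'title_href': item.xpath('./a/@href')[0],          # -> '/review/1'
        'time': item.xpath('./span[@class="time"]/text()')[0],
    }
    print(review)
```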
2. Using BeautifulSoup. Not much to say here; there is plenty of documentation online.
```python
import re
from bs4 import BeautifulSoup

def bs4_paraser(html):
    all_value = []
    value = {}
    soup = BeautifulSoup(html, 'html.parser')
    # Get the review section
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Get each review, i.e. the review item
        all_div_item = row.find_all('div', attrs={'class': 'item'})
        for r in all_div_item:
            # Get the title part of the review
            title = r.find_all('div', attrs={'class': 'g-cleartitle-wrap'}, limit=1)
            if title is not None and len(title) > 0:
                value['title'] = title[0].a.string
                value['title_href'] = title[0].a['href']
                score_text = title[0].div.span.span['style']
                score_text = re.search(r'\d+', score_text).group()
                value['score'] = int(score_text) / 20
                # Time
                value['time'] = title[0].div.find_all('span', attrs={'class': 'time'})[0].string
                # How many people liked it
                value['people'] = int(
                    re.search(r'\d+', title[0].find_all('div', attrs={'class': 'num'})[0].span.string).group())
                # print r
                all_value.append(value)
                value = {}
    return all_value
```
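The find_all / attribute-access pattern used above can be seen in isolation in the following small sketch; as before, the snippet and its values are made up purely for illustration.

```python
# A small self-contained BeautifulSoup sketch of the same find_all / attribute-access pattern.
from bs4 import BeautifulSoup

snippet = '''
<div class="yingping-list-wrap">
  <div class="item">
    <a href="/review/1">Great movie</a>
    <span class="time">2016-08-01</span>
  </div>
</div>
'''

soup = BeautifulSoup(snippet, 'html.parser')
for item in soup.find_all('div', attrs={'class': 'item'}):
    review = {
        'title': item.a.string,          # text of the first <a> inside the item
        'title_href': item.a['href'],    # its href attribute
        'time': item.find_all('span', attrs={'class': 'time'})[0].string,
    }
    print(review)
```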
3. Using SGMLParser. This works mainly through start/end tag callbacks, so the parsing flow is quite clear, but it is somewhat tedious, and the scenario in this example is not really a good fit for the method (ha).
```python
import re
from sgmllib import SGMLParser  # sgmllib is Python 2 only; it was removed in Python 3

class CommentParaser(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # a
        self.__start_a = False
        # span has 3 states
        self.__span_state = 0
        # data
        self.__value = {}
        self.data = []

    def start_div(self, attrs):
        for k, v in attrs:
            if k == 'class' and v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif k == 'class' and v == 'item':
                self.__start_div_item = True
            elif k == 'class' and v == 'g-cleartitle-wrap':
                self.__start_div_gclear = True
            elif k == 'class' and v == 'rating-wrapg-clear':
                self.__start_div_ratingwrap = True
            elif k == 'class' and v == 'num':
                self.__start_div_num = True

    def end_div(self):
        if self.__start_div_yingping:
            if self.__start_div_item:
                if self.__start_div_gclear:
                    if self.__start_div_num or self.__start_div_ratingwrap:
                        if self.__start_div_num:
                            self.__start_div_num = False
                        if self.__start_div_ratingwrap:
                            self.__start_div_ratingwrap = False
                    else:
                        self.__start_div_gclear = False
                else:
                    self.data.append(self.__value)
                    self.__value = {}
                    self.__start_div_item = False
            else:
                self.__start_div_yingping = False

    def start_a(self, attrs):
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            if self.__start_div_ratingwrap:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    for k, v in attrs:
                        if k == 'style':
                            score_text = re.search(r'\d+', v).group()
                            self.__value['score'] = int(score_text) / 20
                            self.__span_state = 3
            elif self.__start_div_num:
                self.__span_state = 4

    def end_span(self):
        self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            score_text = re.search(r'\d+', data).group()
            self.__value['people'] = int(score_text)

def sgl_parser(html):
    parser = CommentParaser()
    parser.feed(html)
    return parser.data
```
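Because sgmllib (and with it SGMLParser) exists only in Python 2, and the full parser above is fairly long, here is a stripped-down sketch of the start_&lt;tag&gt;/end_&lt;tag&gt;/handle_data callback style it relies on, using a made-up one-link snippet.

```python
# Minimal SGMLParser sketch (Python 2 only: sgmllib was removed in Python 3).
from sgmllib import SGMLParser

class LinkCollector(SGMLParser):
    def __init__(self):
        SGMLParser.__init__(self)
        self.in_a = False      # are we currently inside an <a> tag?
        self.links = []        # (href, text) pairs collected so far

    def start_a(self, attrs):  # called for every <a ...> start tag
        self.in_a = True
        self.href = dict(attrs).get('href')

    def end_a(self):           # called for every </a> end tag
        self.in_a = False

    def handle_data(self, data):
        if self.in_a:
            self.links.append((self.href, data))

parser = LinkCollector()
parser.feed('<div><a href="/review/1">Great movie</a></div>')
parser.close()
print parser.links  # [('/review/1', 'Great movie')]
```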
4. Using HTMLParser. The principle is similar to approach 3; only the callback methods differ slightly, so most of the logic can be shared.
```python
import re
import HTMLParser  # Python 2 module name; in Python 3 the module is html.parser

class CommentHTMLParser(HTMLParser.HTMLParser):
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # a
        self.__start_a = False
        # span has 3 states
        self.__span_state = 0
        # data
        self.__value = {}
        self.data = []

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            for k, v in attrs:
                if k == 'class' and v == 'yingping-list-wrap':
                    self.__start_div_yingping = True
                elif k == 'class' and v == 'item':
                    self.__start_div_item = True
                elif k == 'class' and v == 'g-cleartitle-wrap':
                    self.__start_div_gclear = True
                elif k == 'class' and v == 'rating-wrapg-clear':
                    self.__start_div_ratingwrap = True
                elif k == 'class' and v == 'num':
                    self.__start_div_num = True
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        self.__value['href'] = v
        elif tag == 'span':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
                if self.__start_div_ratingwrap:
                    if self.__span_state != 1:
                        for k, v in attrs:
                            if k == 'class' and v == 'rating':
                                self.__span_state = 1
                            elif k == 'class' and v == 'time':
                                self.__span_state = 2
                    else:
                        for k, v in attrs:
                            if k == 'style':
                                score_text = re.search(r'\d+', v).group()
                                self.__value['score'] = int(score_text) / 20
                                self.__span_state = 3
                elif self.__start_div_num:
                    self.__span_state = 4

    def handle_endtag(self, tag):
        if tag == 'div':
            if self.__start_div_yingping:
                if self.__start_div_item:
                    if self.__start_div_gclear:
                        if self.__start_div_num or self.__start_div_ratingwrap:
                            if self.__start_div_num:
                                self.__start_div_num = False
                            if self.__start_div_ratingwrap:
                                self.__start_div_ratingwrap = False
                        else:
                            self.__start_div_gclear = False
                    else:
                        self.data.append(self.__value)
                        self.__value = {}
                        self.__start_div_item = False
                else:
                    self.__start_div_yingping = False
        elif tag == 'a':
            if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            score_text = re.search(r'\d+', data).group()
            self.__value['people'] = int(score_text)

def html_parser(html):
    parser = CommentHTMLParser()
    parser.feed(html)
    return parser.data
```
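The same callback idea works with the standard-library HTMLParser, which survives into Python 3 as html.parser. The sketch below mirrors the previous one and is likewise built on a made-up snippet, with a small try/except import so it runs under either Python version.

```python
# Minimal HTMLParser sketch; handle_starttag/handle_endtag/handle_data replace
# the per-tag start_a/end_a style callbacks of SGMLParser.
try:
    from html.parser import HTMLParser   # Python 3
except ImportError:
    from HTMLParser import HTMLParser    # Python 2

class LinkHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_a = False
        self.links = []

    def handle_starttag(self, tag, attrs):  # one callback for all start tags
        if tag == 'a':
            self.in_a = True
            self.href = dict(attrs).get('href')

    def handle_endtag(self, tag):           # one callback for all end tags
        if tag == 'a':
            self.in_a = False

    def handle_data(self, data):
        if self.in_a:
            self.links.append((self.href, data))

parser = LinkHTMLParser()
parser.feed('<div><a href="/review/1">Great movie</a></div>')
print(parser.links)  # [('/review/1', 'Great movie')]
```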
Approaches 3 and 4 are indeed not a great fit for this particular case, but I am writing them down now while I have the time, for study and reference.