Python爬虫包BeautifulSoup学习实例(五)
本文为大家分享了Python爬虫包BeautifulSoup学习实例,具体内容如下
BeautifulSoup
使用BeautifulSoup抓取豆瓣电影的一些信息。
#-*-coding:utf-8-*-
#@Author:HaonanWu
#@Date:2016-12-2416:18:01
#@LastModifiedby:HaonanWu
#@LastModifiedtime:2016-12-2417:25:33
importurllib2
importjson
frombs4importBeautifulSoup
def nowplaying_movies(url):
    """Fetch a movie listing page and extract the "now playing" entries.

    Requests ``url`` with a browser-like User-Agent, parses the response
    with BeautifulSoup (``lxml`` backend) and keeps every
    ``<li class="list-item">`` element whose ``data-category`` attribute
    equals ``'nowplaying'``.  Each kept movie is printed as
    ``title|score|director|actors`` and appended to the result.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of dicts with the keys ``title``, ``score``, ``director``
        and ``actors``, read from the element's ``data-*`` attributes.
        An attribute that is absent from the element yields ``None``.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails.
    """
    # Browser-like User-Agent; presumably the site rejects the default
    # urllib2 one -- TODO confirm.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/54.0.2840.99 Safari/537.36')
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        # BeautifulSoup consumes the file-like response while constructing
        # the tree, so the connection can be released right afterwards.
        soup_packetpage = BeautifulSoup(response, 'lxml')
    finally:
        response.close()

    # Equivalent spelling:
    #   soup_packetpage.find_all("li", {"class": "list-item"})
    items = soup_packetpage.find_all("li", class_="list-item")

    movies = []
    for item in items:
        # .get() keeps list items without a data-category attribute from
        # aborting the whole scrape with a KeyError.
        if item.attrs.get('data-category') != 'nowplaying':
            continue
        movie = {
            'title': item.attrs.get('data-title'),
            'score': item.attrs.get('data-score'),
            'director': item.attrs.get('data-director'),
            'actors': item.attrs.get('data-actors'),
        }
        movies.append(movie)
        print('%(title)s|%(score)s|%(director)s|%(actors)s' % movie)
    return movies
if __name__ == '__main__':
    # Scrape the Beijing "now playing" page, then dump the collected movies
    # as key-sorted, 4-space-indented JSON.
    url = 'https://movie.douban.com/nowplaying/beijing/'
    movies = nowplaying_movies(url)
    print(
        json.dumps(
            movies,
            indent=4,
            sort_keys=True,
            separators=(',', ':'),
        )
    )
HTMLParser
使用HTMLParser实现上述功能
这里有一些HTMLParser的基础教程
需要注意:"自2006年以后就再没更新、很多人推荐使用jsoup代替"的说法针对的是Java的HTMLParser库;本文使用的是Python标准库自带的HTMLParser模块(Python 3中更名为html.parser),它随标准库一同维护,可以直接使用。
#-*-coding:utf-8-*-
#@Author:HaonanWu
#@Date:2016-12-2415:57:54
#@LastModifiedby:HaonanWu
#@LastModifiedtime:2016-12-2417:03:27
fromHTMLParserimportHTMLParser
importurllib2
importjson
class MovieParser(HTMLParser):
    """HTML parser that collects "now playing" movies from ``<li>`` tags.

    After feeding markup, ``self.movies`` holds one dict per ``<li>`` start
    tag that has a non-empty ``data-title`` attribute and whose
    ``data-category`` attribute equals ``'nowplaying'``.  Each dict has the
    keys ``title``, ``score``, ``director`` and ``actors``; an attribute
    missing from the tag is stored as ``None``.
    """

    def __init__(self):
        # Explicit base-class call, exactly as the parent initializer is
        # invoked elsewhere in this file's Python 2 style.
        HTMLParser.__init__(self)
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record the movie described by a matching ``<li>`` start tag.

        Args:
            tag: Name of the tag being opened.
            attrs: Sequence of ``(name, value)`` attribute pairs.
        """
        def first_value(wanted):
            # Value of the first attribute named ``wanted``, else None.
            return next((val for key, val in attrs if key == wanted), None)

        if tag != 'li':
            return
        if not first_value('data-title'):
            return
        if first_value('data-category') != 'nowplaying':
            return

        entry = {}
        for field, attr_name in (('title', 'data-title'),
                                 ('score', 'data-score'),
                                 ('director', 'data-director'),
                                 ('actors', 'data-actors')):
            entry[field] = first_value(attr_name)
        self.movies.append(entry)
        print('%(title)s|%(score)s|%(director)s|%(actors)s' % entry)
def nowplaying_movies(url):
    """Download a movie listing page and parse it with ``MovieParser``.

    Args:
        url: Address of the page to scrape.

    Returns:
        The parser's ``movies`` list: one dict per matching ``<li>`` tag
        with the keys ``title``, ``score``, ``director`` and ``actors``.

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails.
    """
    # Browser-like User-Agent; presumably the site rejects the default
    # urllib2 one -- TODO confirm.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/54.0.2840.99 Safari/537.36'),
    }
    req = urllib2.Request(url, headers=headers)
    s = urllib2.urlopen(req)
    try:
        raw = s.read()
    finally:
        # Release the connection even when reading fails.
        s.close()

    parser = MovieParser()
    # Feed text rather than raw bytes: the parser documentation advises
    # unicode input, and byte strings that mix non-ASCII data with
    # character entities can fail while attribute values are unescaped.
    # NOTE(review): assumes the page is UTF-8 encoded -- confirm, or derive
    # the charset from the Content-Type header.
    parser.feed(raw.decode('utf-8', 'replace'))
    # Flush anything the parser is still buffering.
    parser.close()
    return parser.movies
if __name__ == '__main__':
    # Scrape the Beijing "now playing" page, then dump the collected movies
    # as key-sorted, 4-space-indented JSON.
    url = 'https://movie.douban.com/nowplaying/beijing/'
    movies = nowplaying_movies(url)
    print(
        json.dumps(
            movies,
            indent=4,
            sort_keys=True,
            separators=(',', ':'),
        )
    )
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。