# A crawler, implemented in Python, that scrapes all movies from Tencent Video (v.qq.com).
# I collected data from a dozen or so Chinese movie sites -- several hundred thousand records
# in total. Plain text files cannot hold that much; MongoDB has a very low learning curve,
# and downloading, installing and running it takes less than five minutes.
# -*- coding: utf-8 -*-
# by awakenjoys. my site: www.dianying.at
# Standard library
import re
import string
import time
import urllib.request  # Python 3 replacement for the original "import urllib2"

# Third-party
import pymongo
from bs4 import BeautifulSoup
NUM = 0         # global: number of movies scraped so far
m_type = u''    # global: current movie genre/type being crawled
m_site = u'qq'  # global: identifier of the movie site being crawled
# Fetch the page content for the given URL.
def gethtml(url):
    """Return the raw body (bytes) of the page at *url*.

    Raises urllib.error.URLError on network failure.
    """
    req = urllib.request.Request(url)
    # Context manager guarantees the connection is closed even on error
    # (the original never closed the response).
    with urllib.request.urlopen(req) as response:
        return response.read()
# Extract the movie genres from the genre-list page.
def gettags(html):
    """Parse the genre-list page *html* and return a dict mapping
    genre title -> genre listing URL (empty dict when nothing matched).

    Side effect: updates the module-level ``m_type`` to the last genre
    parsed (behaviour preserved from the original).
    """
    global m_type
    soup = BeautifulSoup(html, 'html.parser')  # narrow down to the genre block
    # <ul class="clearfix _group" gname="mi_type" gtype="1">
    # NOTE(review): class/attr strings reconstructed with spaces from the
    # whitespace-mangled source -- confirm against the live markup.
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
    # <a _hot="tag.sub" class="_gtag _hotkey" href="..." title="动作" tvalue="0">动作</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)
    tags = p.findall(str(tags_all[0]))
    # BUG FIX: initialise tags_url up front; the original left it unbound
    # when no tags matched, so the final return raised NameError.
    tags_url = {}
    if tags:
        for tag in tags:
            tag_url = tag[0]   # Python 3: findall on str already yields str
            m_type = tag[1]
            tags_url[m_type] = tag_url
    else:
        print("Not Find")
    return tags_url
# Get the number of pages in one genre's listing.
def get_pages(tag_url):
    """Return the page count for the genre listing at *tag_url*.

    Returns the count parsed from the pager (a string) when a pager is
    present, otherwise the int 1 (single-page listing).
    """
    tag_html = gethtml(tag_url)
    # <div class="paginator">
    soup = BeautifulSoup(tag_html, 'html.parser')  # narrow down to the pager markup
    # <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})
    # <a class="c_txt6" href="..." title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # The last pager entry is the "next" arrow; the one before it
        # carries the total page count.
        return pages[-2]
    return 1
# Parse one listing page and process every movie container on it.
def getmovielist(html):
    """Extract each <ul class="mod_list_pic_130"> movie container from
    *html* and hand it to getmovie() for extraction/storage."""
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for div_html in divs:
        # Collapse newlines so the single-line regex in getmovie() matches.
        getmovie(str(div_html).replace('\n', ''))
# Extract every movie from a listing fragment and store it in MongoDB.
def getmovie(html):
    """Find every movie link/title in *html* and insert one document per
    movie into the ``dianying.playlinks`` collection.

    Side effects: increments the global counter ``NUM`` and reads the
    globals ``m_type`` / ``m_site`` into each stored document.
    Returns None.
    """
    global NUM
    global m_type
    global m_site
    # <li><a class="mod_poster_130" href="..." target="_blank" title="...">
    #     <img ...></a>...</li>
    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        # pymongo.Connection was removed in pymongo 3; MongoClient is the
        # drop-in replacement.
        conn = pymongo.MongoClient('localhost', 27017)
        movie_db = conn.dianying
        playlinks = movie_db.playlinks
        for movie in movies:
            # BUG FIX: the original incremented NUM twice per movie,
            # doubling the reported count -- count each movie once.
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movie_title=movie[1],
                movie_url=movie[0],
                movie_site=m_site,
                movie_type=m_type,
            )
            print(values)
            # collection.insert was removed in pymongo 3; use insert_one.
            playlinks.insert_one(values)
            print("_" * 70)
# Fetch a movie's album page and extract its play links.
def getmovieinfo(url):
    """Fetch the album page at *url* and return a list of
    (play_url, title) tuples; an empty list when nothing matched."""
    html = gethtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    # <div class="pack pack_album album_cover">
    # NOTE(review): class reconstructed with spaces from the mangled
    # source -- confirm against the live markup.
    divs = soup.find_all('div', {'class': 'pack pack_album album_cover'})
    if not divs:
        # BUG FIX: the original indexed divs[0] unconditionally and raised
        # IndexError on pages without an album block.
        print("Not find movie info")
        return []
    # <a href="..." target="new" title="..." wl="1"></a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"></a>'
    p_info = re.compile(re_info, re.DOTALL)
    m_info = p_info.findall(str(divs[0]))
    if not m_info:
        print("Not find movie info")
    return m_info
# Insert one movie-info document into MongoDB.
def insertdb(movieinfo):
    """Insert *movieinfo* into the ``dianying_at.movies`` collection.

    NOTE(review): relies on a module-level ``conn`` client that the main
    block declares ``global`` but never actually creates -- calling this
    as-is raises NameError; confirm where conn is meant to be initialised.
    """
    global conn
    movie_db = conn.dianying_at
    movies = movie_db.movies
    # collection.insert was removed in pymongo 3; use insert_one.
    movies.insert_one(movieinfo)
if __name__ == "__main__":
    # NOTE(review): the original declared "global conn" here, which is a
    # no-op at module level and conn was never assigned; dropped it.
    # Entry page listing all genres.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)
    # Crawl every page of every genre.
    for title, url in tag_urls.items():
        print(url)
        maxpage = int(get_pages(url))
        print(maxpage)
        for x in range(0, maxpage):
            # e.g. http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
            # The page index sits just before the "_20_0_-1_0.html" suffix.
            m_url = url.replace('0_20_0_-1_0.html', '')
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)
            movie_html = gethtml(movie_url)
            getmovielist(movie_html)
            time.sleep(0.1)  # throttle: be polite to the server