Python使用Selenium模块模拟浏览器抓取斗鱼直播间信息示例
本文实例讲述了Python使用Selenium模块模拟浏览器抓取斗鱼直播间信息的方法。分享给大家供大家参考,具体如下:
import time
from multiprocessing import Pool

from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.errors import PyMongoError
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# MongoDB connection settings.
MONGO_HOST = "localhost"
MONGO_DATABASE = "douyu"
MONGO_TABLE = "zhibo"
client = MongoClient(MONGO_HOST)
db = client[MONGO_DATABASE]

# PhantomJS command-line options.
# See http://phantomjs.org/api/command-line.html
SERVICE_ARGS = [
    '--disk-cache=true',
    '--load-images=false',
]

# Headless browser shared by every function below; swap in
# webdriver.Chrome() to watch the crawl in a visible window.
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)

# Timeout (seconds) applied to every explicit wait made through `wait`.
delay = 10
wait = WebDriverWait(driver, timeout=delay)
driver.maximize_window()
def get_total_pages():
    """Open the Douyu live directory, save page 1 and return the page count.

    Navigates the shared ``driver`` to the directory listing. Since that
    navigation already shows the first page, its rooms are scraped and
    stored here as a side effect.

    Returns:
        int: the number shown in the pager item that directly follows the
        ``.shark-pager-dot`` element (presumably the last page number).

    Raises:
        TimeoutException: the pager item did not appear within the wait.
        ValueError: the pager item's text is not an integer.
    """
    url = 'https://www.douyu.com/directory/all'
    driver.get(url)
    # Wait explicitly for the pager instead of reading it straight after
    # driver.get(): an immediate lookup fails if the element is not in the
    # DOM yet. The (By, selector) locator form also replaces the
    # find_element_by_* helpers, which Selenium 4 removed.
    last_page_item = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.shark-pager-dot+.shark-pager-item')))
    pages = int(last_page_item.text)
    print("正在获取第1页数据")
    room_list = get_rooms_by_beautifulsoup()
    save_to_monogodb(room_list)
    return pages
# Fetch the requested page of the listing and save its rooms to the database.
def parse_page(page_num, max_retries=3):
    """Jump to ``page_num`` through the pager's input box and store its rooms.

    Args:
        page_num: 1-based page number to load.
        max_retries: how many additional attempts to make after the first
            one times out. Retrying used to recurse without limit, so a
            page that never loaded ended in ``RecursionError``.

    Returns:
        bool: True once the page was scraped, False if every attempt
        timed out and the page was skipped.
    """
    for attempt in range(max_retries + 1):
        if attempt:
            print("尝试重新获取第%d页" % page_num)
        print("正在获取第%d页数据" % page_num)
        try:
            page_num_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "input.jumptxt")))
            go_btn = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'a.shark-pager-submit')))
            page_num_box.clear()
            page_num_box.send_keys(str(page_num))
            go_btn.click()
            # Block until the pager marks the requested page as current.
            # A CSS selector is used because By.CLASS_NAME rejects compound
            # class names ("invalid selector: Compound class names not
            # permitted").
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '.shark-pager-item.current'),
                    str(page_num)))
            room_list = get_rooms_by_beautifulsoup()
            save_to_monogodb(room_list)
            return True
        except TimeoutException:
            print("请求第%d页失败" % page_num)
    print("第%d页重试%d次后仍然失败,已跳过" % (page_num, max_retries))
    return False
# Parse the room list with bs4.
def get_rooms_by_beautifulsoup():
    """Yield one dict per live room found on the page currently loaded.

    Waits until at least one ``ul#live-list-contentbox>li`` item is present,
    then parses ``driver.page_source`` with BeautifulSoup (lxml parser).

    Yields:
        dict: ``room_name``, ``view_count``, ``tag`` and ``hostname`` strings
        for each room. Items missing any of the four nodes are skipped
        rather than aborting the whole page with ``AttributeError``.

    Raises:
        TimeoutException: no room item appeared within the wait.
    """
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "ul#live-list-contentbox>li")))
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for room in soup.select('ul#live-list-contentbox>li'):
        title_node = room.find('h3', attrs={'class': 'ellipsis'})
        # NOTE(review): the three multi-class values below had their spaces
        # stripped in the pasted source ('dy-numfr', 'tagellipsis',
        # 'dy-nameellipsisfl'); they are restored here on the assumption
        # that each was a space-separated class list. Confirm against the
        # live markup.
        viewers_node = room.find('span', class_='dy-num fr')
        tag_node = room.find('span', class_='tag ellipsis')
        host_node = room.find('span', class_='dy-name ellipsis fl')
        nodes = (title_node, viewers_node, tag_node, host_node)
        if any(node is None for node in nodes):
            # Not a regular room card; nothing reliable to record.
            continue
        yield {
            'room_name': title_node.get_text(strip=True),
            'view_count': viewers_node.text,
            'tag': tag_node.text,
            'hostname': host_node.text,
        }
def save_to_monogodb(room_list):
    """Insert every room document into the ``MONGO_TABLE`` collection.

    Args:
        room_list: iterable of dicts, one per room (a generator is fine;
            it is consumed exactly once).

    A failed insert is reported and the loop moves on to the next room,
    so one bad document does not stop the rest from being stored.
    """
    collection = db[MONGO_TABLE]
    for room in room_list:
        try:
            # insert_one replaces Collection.insert, which PyMongo
            # deprecated in 3.0 and removed in 4.0.
            collection.insert_one(room)
            print("mongodb插入数据成功:", room)
        except PyMongoError as e:
            print("mongodb插入数据失败:", room, e)
if __name__ == '__main__':
    # Bound before the try so the summary in `finally` cannot raise
    # NameError (and skip the browser cleanup) when get_total_pages() fails.
    total_pages = 0
    try:
        # get_total_pages() already stores page 1, so continue from page 2.
        total_pages = get_total_pages()
        for page_num in range(2, total_pages + 1):
            parse_page(page_num)
    except Exception as e:
        print("出错了", e)
    finally:
        # Always shut the browser down. quit() ends the whole WebDriver
        # session, whereas close() only closes the current window.
        print("共有%d页" % total_pages)
        driver.quit()
更多Python相关内容可查看本站专题:《Python Socket编程技巧总结》、《Python正则表达式用法总结》、《Python数据结构与算法教程》、《Python函数使用技巧总结》、《Python字符串操作技巧汇总》、《Python入门与进阶经典教程》及《Python文件与目录操作技巧汇总》。
希望本文所述对大家Python程序设计有所帮助。