Python抓取京东图书评论数据
京东图书评论包含非常丰富的信息,其中有购买日期、书名、作者、好评、中评、差评等等。下面以购买日期为例,使用 Python + MySQL 搭配实现抓取。程序不大,只有 100 行左右,相关的解释我都已在程序里加了注释:
import re
import sys
import threading
import time

import MySQLdb
import win32com.client
from bs4 import BeautifulSoup
from selenium import webdriver
def mydebug():
    """Debugging aid: shut down the browser driver and stop the script.

    Uses the module-level ``driver`` created by the main script, so it can
    only be called after the driver has been started.

    Raises:
        SystemExit: always, with exit status 0.
    """
    driver.quit()
    # Use sys.exit() rather than the bare exit() helper: the latter is
    # injected by the ``site`` module for interactive sessions and is not
    # meant to be relied on inside programs.
    sys.exit(0)
def catchDate(s):
    """Extract the purchase-date nodes from one review page.

    Every ``<div class="date-buy">`` element in the page is inspected; for
    each one that contains a ``<br>`` tag with non-empty ``contents``, that
    ``contents`` list is collected.

    Side effect: the module-level counter ``nowtimes`` is incremented once
    per collected entry.

    Args:
        s: HTML source of the page.

    Returns:
        A list with one item per date found. Each item is the (non-empty)
        ``contents`` list of the ``<br>`` tag, so callers read the value
        with ``item[0]``.
    """
    global nowtimes
    # NOTE(review): no parser is named here, so bs4 chooses whichever one is
    # installed; pin one explicitly if results must match across machines.
    soup = BeautifulSoup(s)
    dates = []
    for block in soup.find_all("div", class_="date-buy"):
        br = block.find("br")
        if br is None:
            # No <br> inside this block, so there is nothing to extract.
            continue
        contents = br.contents
        # ``contents`` is a list, so test for emptiness directly; comparing
        # it with "" is always unequal and would let empty lists through.
        if contents:
            dates.append(contents)
            nowtimes += 1
    return dates
def getTimes(n, t):
    """Format the crawl progress as a whole-number percentage.

    Args:
        n: Amount of work completed so far.
        t: Total amount of work expected.

    Returns:
        The progress message: the fixed prefix followed by ``100 * n / t``
        truncated to an integer and a percent sign. When ``t`` is zero the
        progress is reported as 0% instead of raising ZeroDivisionError.
    """
    if not t:
        percent = 0
    else:
        percent = int(100 * n / t)
    return "当前进度为:" + str(percent) + "%"
# ------------------------------ program start ------------------------------
# Book categories handled by the crawl, keyed by category id.
cate = {
    "3273": "历史",
    "3279": "心理学",
    "3276": "政治军事",
    "3275": "国学古籍",
    "3274": "哲学宗教",
    "3277": "法律",
    "3280": "文化",
    "3281": "社会科学",
}

# Resume point: the book id to restart from and the last page already done.
# Read the values as plain text and convert explicitly; Python 2's input()
# evaluates whatever is typed as an expression.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3
num1 = _read_line("bookid:").strip()
num2 = int(_read_line("pagenumber:"))

# Expected amount of work for the whole crawl: 17355 * 20 = 347100.
totaltimes = 347100.0
nowtimes = 0

# Start the browser driver. PhantomJS or Chrome (chromedriver) can be swapped
# in here instead of Internet Explorer.
# Raw string so the backslashes in the Windows path are never read as escapes.
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')

conn = None
cursor = None
try:
    # Connect to the database that lists the review pages to crawl.
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='jd')
    except MySQLdb.Error as e:
        print(e)
        # The ``finally`` below still shuts the browser down.
        sys.exit(1)

    cursor = conn.cursor()
    sql = "SELECT * FROM booknew ORDER BY pagenumber DESC"
    cursor.execute(sql)
    alldata = cursor.fetchall()

    # Parameterized statement: the date comes from a scraped page and must
    # not be spliced into the SQL text.
    insert_sql = "INSERT INTO ljj(id,cateid,bookid,date) VALUES(NULL,%s,%s,%s)"

    flag = 0  # becomes 1 once the resume book has been reached
    first_page = num2  # page offset, applies to the resumed book only
    for rec in alldata:
        # rec[0] -- bookid, rec[1] -- cateid, rec[2] -- pagenumber
        if not flag:
            if rec[0] != str(num1):
                continue
            flag = 1
        # Review pages look like http://club.jd.com/review/10178500-1-154.html
        for p in range(first_page + 1, int(rec[2]) + 1):
            link = "http://club.jd.com/review/" + rec[0] + "-1-" + str(p) + ".html"
            # Fetch the page and pull the purchase dates out of it.
            driver.get(link)
            html = driver.page_source
            buydate = catchDate(html)
            # Store one row per purchase date.
            for entry in buydate:
                if not entry:
                    continue
                # bs4 text nodes are string subclasses; hand the database
                # driver a plain string.
                purchase_date = "%s" % (entry[0],)
                try:
                    # NOTE(review): values are ordered to match the column
                    # list (cateid, bookid). The previous statement stored
                    # rec[0] (bookid) in cateid and rec[1] in bookid, so rows
                    # written by earlier runs have the two columns swapped.
                    cursor.execute(insert_sql, (rec[1], rec[0], purchase_date))
                except MySQLdb.Error as e:
                    # Best effort: report the failed row and keep crawling.
                    print(e)
            conn.commit()
            print(getTimes(nowtimes, totaltimes))
        # Every book after the resumed one starts from its first page, even
        # when the resumed book had no pages left to fetch.
        first_page = 0
finally:
    # Always release the browser and the database handles, even on errors.
    driver.quit()
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()