python爬虫获取淘宝天猫商品详细参数
首先我是从淘宝进去,爬取了按销量排序的所有(100页)女装的列表信息按综合、销量分别爬取淘宝女装列表信息,然后导出前100商品的link,爬取其详细信息。这些商品有淘宝的,也有天猫的,这两个平台有些区别,处理的时候要注意。比如,有的说“面料”、有的说“材质成分”,其实是一个意思,等等。可以取不同的链接做一下测试。
importre
fromcollectionsimportOrderedDict
frombs4importBeautifulSoup
frompyqueryimportPyQueryaspq#获取整个网页的源代码
fromconfigimport*#可引用congif的所有变量
importpymysql
importurllib
importjson
importbs4
importrequests
fromseleniumimportwebdriver
fromselenium.webdriver.support.uiimportWebDriverWait
frompyqueryimportPyQueryaspq#获取整个网页的源代码
importpandasaspd
#测试淘宝+天猫,可完整输出及保存
browser=webdriver.Firefox()
wait=WebDriverWait(browser,10)
#######天猫上半部分详情#############
defget_tianmao_header(url):
browser.get(url)
#wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'#mainsrp-itemlist.items.item')))#加载所有宝贝
html=browser.page_source
doc=pq(html)
#print(doc)
info=OrderedDict()#存放该商品所具有的全部信息
items=doc('#page')
#info['店铺名']=items.find('.slogo').find('.slogo-shopname').text()
#info['ID']=items.find('#LineZing').attr['itemid']
info['宝贝']=items.find('.tb-detail-hd').find('h1').text()
info['促销价']=items.find('#J_PromoPrice').find('.tm-promo-price').find('.tm-price').text()
info['原价']=items.find('#J_StrPriceModBox').find('.tm-price').text()
#'月销量':items.find('.tm-ind-panel').find('.tm-ind-itemtm-ind-sellCount').find('.tm-indcon').find('.tm-count').text(),
info['月销量']=items.find('.tm-ind-panel').find('.tm-indcon').find('.tm-count').text().split('',2)[0]
info['累计评价']=items.find('#J_ItemRates').find('.tm-indcon').find('.tm-count').text()
#print(info)
returninfo
########淘宝上半部分详情###############
defget_taobao_header(url):
browser.get(url)
#wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'#mainsrp-itemlist.items.item')))#加载所有宝贝
html=browser.page_source
doc=pq(html)
#print(doc)
info=OrderedDict()#存放该商品所具有的全部信息
items=doc('#page')
#info['店铺名']=items.find('.tb-shop-seller').find('.tb-seller-name').text()
#info['ID']=items.find('#J_Pine').attr['data-itemid']
info['宝贝']=items.find('#J_Title').find('h3').text()
info['原价']=items.find('#J_StrPrice').find('.tb-rmb-num').text()
info['促销价']=items.find('#J_PromoPriceNum').text()
#'月销量':items.find('.tm-ind-panel').find('.tm-ind-itemtm-ind-sellCount').find('.tm-indcon').find('.tm-count').text(),
info['月销量']=items.find('#J_SellCounter').text()
info['累计评价']=items.find('#J_RateCounter').text()
#print(info)
returninfo
#######################详情############################
#抓取所有商品详情
defget_Details(attrs,info):
#res=requests.get(url)
#soup=BeautifulSoup(res.text,"html.parser")
#
#attrs=soup.select('.attributes-listli')
#attrs=[厚薄:薄,材质成分:其他100%,]
attrs_name=[]
attrs_value=[]
'''''
[\s]匹配空格,[\s]*,后面有*,则可以为空
*:匹配前面的子表达式任意次
'''
forattrinattrs:
attrs_name.append(re.search(r'(.*?):[\s]*(.*)',attr.text).group(1))
attrs_value.append(re.search(r'(.*?):[\s]*(.*)',attr.text).group(2))
#print('attrs_name=',attrs_name)#attrs_name=['厚薄','材质成分',...]
#print('attrs_value=',attrs_value)#attrs_value=['薄','其他100%',...]
allattrs=OrderedDict()#存放该产品详情页面所具有的属性
forkinrange(0,len(attrs_name)):
allattrs[attrs_name[k]]=attrs_value[k]
#print('allattrs=',allattrs)#allattrs=OrderedDict([('厚薄','薄'),('材质成分','其他100%'),...])
#info=OrderedDict()#存放该商品所具有的全部信息
#info=get_headdetail2(url)
#下面三条语句获取描述、服务、物流的评分信息
#下面的语句用来判断该商品具有哪些属性,如果具有该属性,将属性值插入有序字典,否则,该属性值为空
#适用场景
if'材质成分'inattrs_name:
info['材质成分']=allattrs['材质成分']
elif'面料'inattrs_name:
info['材质成分']=allattrs['面料']
else:
info['材质成分']='NA'
#适用对象
if'流行元素'inattrs_name:
info['流行元素']=allattrs['流行元素']
else:
info['流行元素']='NA'
#季节
if'年份季节'inattrs_name:
info['年份季节']=allattrs['年份季节']
else:
info['年份季节']='NA'
#款式
if'袖长'inattrs_name:
info['袖长']=allattrs['袖长']
else:
info['袖长']='NA'
#尺码
if'销售渠道类型'inattrs_name:
info['销售渠道类型']=allattrs['销售渠道类型']
else:
info['销售渠道类型']='NA'
#帽顶款式
if'货号'inattrs_name:
info['货号']=allattrs['货号']
else:
info['货号']='NA'
#帽檐款式
if'服装版型'inattrs_name:
info['服装版型']=allattrs['服装版型']
else:
info['服装版型']='NA'
#檐形
if'衣长'inattrs_name:
info['衣长']=allattrs['衣长']
else:
info['衣长']='NA'
#主要材质
if'领型'inattrs_name:
info['领型']=allattrs['领型']
else:
info['领型']='NA'
#人群
if'袖型'inattrs_name:
info['袖型']=allattrs['袖型']
else:
info['袖型']='NA'
#品牌
if'品牌'inattrs_name:
info['品牌']=allattrs['品牌']
else:
info['品牌']='NA'
#风格
if'图案'inattrs_name:
info['图案']=allattrs['图案']
elif'中老年女装图案'inattrs_name:
info['图案']=allattrs['中老年女装图案']
else:
info['图案']='NA'
#款式细节
if'服装款式细节'inattrs_name:
info['服装款式细节']=allattrs['服装款式细节']
else:
info['服装款式细节']='NA'
#适用年龄
if'适用年龄'inattrs_name:
info['适用年龄']=allattrs['适用年龄']
else:
info['适用年龄']='NA'
#风格
if'风格'inattrs_name:
info['风格']=allattrs['风格']
elif'中老年风格'inattrs_name:
info['风格']=allattrs['中老年风格']
else:
info['风格']='NA'
#通勤
if'通勤'inattrs_name:
info['通勤']=allattrs['通勤']
else:
info['通勤']='NA'
if'裙长'inattrs_name:
info['裙长']=allattrs['裙长']
else:
info['裙长']='NA'
if'裙型'inattrs_name:
info['裙型']=allattrs['裙型']
else:
info['裙型']='NA'
if'腰型'inattrs_name:
info['腰型']=allattrs['腰型']
else:
info['腰型']='NA'
#颜色分类
if'主要颜色'inattrs_name:
info['主要颜色']=allattrs['主要颜色']
else:
info['主要颜色']='NA'
if'颜色分类'inattrs_name:
info['主要颜色']=allattrs['颜色分类']
else:
info['主要颜色']='NA'
#尺码
if'尺码'inattrs_name:
info['尺码']=allattrs['尺码']
else:
info['尺码']='NA'
if'组合形式'inattrs_name:
info['组合形式']=allattrs['组合形式']
else:
info['组合形式']='NA'
if'裤长'inattrs_name:
info['裤长']=allattrs['裤长']
else:
info['裤长']='NA'
returninfo
importcsv
defmain():
#提取列
withopen('clothes_detai.csv','w',newline='',encoding='utf-8')ascsvfile:
#fieldnames=['店铺ID','店铺名','链接','宝贝','原价','促销价','月销量','累计评价','材质成分','流行元素','袖长','年份季节','销售渠道类型','货号','服装版型','衣长','领型','袖型',
#'裙型','裙长','腰型','裤长','组合形式','品牌','图案','服装款式细节','适用年龄','风格','通勤','主要颜色','尺码']
fieldnames=['Link','Brand','Title','Price','Saleprice','Sales','Evaluations',
'Component','Fashionelements','Sleeve','Seasons','Saleschannels',
'Number','Clothes_Style','Long','Collartype','Sleevetype',
'Skirttype','Skirtlength','Waist','Combiningform','Outseam',
'Design','Fashionpatterndetail','Applicableage',
'Style','Commuter','color','Size']
#'Shop','Data_id','Shop_id','Shop','Link','Data_id',
writer=csv.DictWriter(csvfile,fieldnames=fieldnames)
writer.writeheader()
#urls=['//detail.tmall.com/item.htm?spm=a230r.1.14.1.ebb2eb2eGyUw1&id=549177691667&ns=1&abbucket=4',
#'//item.taobao.com/item.htm?id=548443640333&ns=1&abbucket=0#detail']
f=pd.read_csv('women_clothes_sales2.csv')
urls=f['link'][0:100]
#sh=f['shop_id'][0:3]
#s=f['shop'][0:3]
#forurlinurls:
#print(url)
#writer.writerow({'店铺ID':f['shop_id'],'店铺名':f['shop']})
keys,values=[],[]
#forurlinurls:
foriinurls:
url='http:'+i
#endswith判断字符串是否以指定的字符串结尾
ifurl.endswith('detail'):
info=get_taobao_header(url)
res=requests.get(url)
soup=BeautifulSoup(res.text,"html.parser")
attrs=soup.select('.attributes-listli')#淘宝class
else:
info=get_tianmao_header(url)
res=requests.get(url)
soup=BeautifulSoup(res.text,"html.parser")
attrs=soup.select('#J_AttrULli')#天猫id
#print('attrs=',attrs)
d=get_Details(attrs,info)
print(d)
#forjinf[shop_id]:
#d['店铺ID']=j
#forsinf['shop']:
#d['店铺名']=s
#'Shop':d['店铺名'],'Data_id':d['ID'],
writer.writerow({'Link':url,'Brand':d['品牌'],'Title':d['宝贝'],'Price':d['原价'],'Saleprice':d['促销价'],'Sales':d['月销量'],'Evaluations':d['累计评价'],
'Component':d['材质成分'],'Fashionelements':d['流行元素'],'Sleeve':d['袖长'],'Seasons':d['年份季节'],'Saleschannels':d['销售渠道类型'],
'Number':d['货号'],'Clothes_Style':d['服装版型'],'Long':d['衣长'],'Collartype':d['领型'],'Sleevetype':d['袖型'],
'Skirttype':d['裙型'],'Skirtlength':d['裙长'],'Waist':d['腰型'],'Combiningform':d['组合形式'],'Outseam':d['裤长'],
'Design':d['图案'],'Fashionpatterndetail':d['服装款式细节'],'Applicableage':d['适用年龄'],
'Style':d['风格'],'Commuter':d['通勤'],'color':d['主要颜色'],'Size':d['尺码']})
if__name__=='__main__':
main()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。