python实现爬取图书封面
本文实例为大家分享了python实现爬取图书封面的具体代码,供大家参考,具体内容如下
kongfuzi.py
利用更换代理ip,延迟提交数据,设置请求头破解网站的反爬虫机制
# Imports for kongfuzi.py.  (The original listing had all whitespace
# stripped, e.g. "importrequests"; reconstructed here.)
import random
import time

import requests
class DownLoad():
    """Small HTTP downloader that works around simple anti-scraping checks.

    Strategy (per the original article): rotate proxy IPs, sleep between
    retries, and send a randomized User-Agent request header.
    """

    def __init__(self):
        # Candidate HTTP proxies, "host:port".  NOTE(review): these are
        # public proxies copied from the original article and are likely
        # dead by now; main.py lets the user append a fresh one.
        self.ip_list = ['191.33.179.242:8080', '122.72.108.53:80', '93.190.142.214:80', '189.8.88.125:65301',
                        '36.66.55.181:8080', '170.84.102.5:8080', '177.200.72.214:20183', '115.229.115.190:9000']
        # User-Agent header *values*.  Bug fix: the original embedded a
        # literal "User-Agent:" prefix inside each value, which would have
        # produced a header of the form "User-Agent: User-Agent:Mozilla...".
        self.user_agent_list = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        ]

    def get(self, url, proxy=None, timeout=20, num=5):
        """GET *url*, retrying with random proxies on failure.

        proxy   -- requests-style proxies dict, e.g. {'http': 'ip:port'};
                   None means try a direct request first.
        timeout -- per-request timeout in seconds.
        num     -- remaining retry budget.
        Returns a requests.Response, or None once retries are exhausted
        on the proxy path (preserves the original's implicit None).
        """
        print("正在请求%s" % url)
        headers = {'User-Agent': random.choice(self.user_agent_list)}
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    time.sleep(10)
                    return self.get(url, num=num - 1)
                # Direct retries exhausted: fall back to a random proxy.
                time.sleep(10)
                ip = random.choice(self.ip_list).strip()
                return self.get(url, proxy={'http': ip}, timeout=timeout)
        else:
            try:
                # Bug fix: the original called requests.get(..., proxy=...),
                # which is a TypeError — the correct keyword is proxies=.
                ip = random.choice(self.ip_list).strip()
                proxy = {'http': ip}
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    time.sleep(10)
                    proxy = {'http': random.choice(self.ip_list).strip()}
                    print("正在更换代理")
                    print("当前代理%s" % proxy)
                    return self.get(url, proxy=proxy, num=num - 1)
main.py
将爬取的图片保存到本地,然后展示到界面
# Imports for main.py, grouped stdlib / third-party / local.
# NOTE(review): the wildcard tkinter import is kept because the GUI code
# below relies on bare names (Tk, Entry, Label, Canvas, LEFT, W, ...).
import os
from tkinter import *

import bs4
import requests
from PIL import Image, ImageTk

import kongfuzi
# Search entry point: build the Kongfz search URL from the keyword entry
# widget and hand off to show(), which fetches, parses and renders results.
def download():
    base_url = "http://search.kongfz.com"
    keyword = e1.get()  # e1 is the keyword Entry created under __main__
    url = base_url + "/product_result/?select=0&key=" + keyword
    print("下载链接:" + url)
    show(url)
# Parse a requests.Response body with BeautifulSoup.
def changesoup(html):
    """Return a BeautifulSoup tree for the response *html*.

    NOTE(review): assumes the page body is UTF-8 — confirm against the
    site's declared charset (html.text would respect it automatically).
    """
    raw = html.content
    html_doc = str(raw, 'utf-8')
    return bs4.BeautifulSoup(html_doc, "html.parser")
# Scrape the per-book metadata lists off a search-result page.
def bookinfo(soup):
    """Extract book info from a Kongfz result page.

    Returns four parallel lists: (bookname, price, place, storename).

    NOTE(review): the CSS selectors below look whitespace-mangled in the
    original listing (e.g. ".textaspan" was probably ".text a span");
    they are kept byte-for-byte and should be verified against the site.
    """
    # Book prices.
    price = [el.string for el in soup.select(".first-info.f_right.bold")]
    # Shop names.  Bug fix: the original removed items from the selected
    # list *while iterating it*, which silently skips the element that
    # follows every removal; build a filtered list instead.
    storename = [el.string for el in soup.select(".textaspan") if el.string is not None]
    # Seller locations.
    place = [el.string for el in soup.select(".user-place")]
    # Book titles.
    bookname = []
    title_nodes = soup.select(
        ".search-wrap.search-main.search-main-result.result-content.result-list.item.item-info.title.link")
    for node in title_nodes:
        print(node)
        text = node.get_text()
        print(text)
        bookname.append(text)
    return bookname, price, place, storename
# Download every result-list cover image into ./image.
def imgsave(soup):
    """Save cover images locally; return the list of local file paths.

    Returns an empty list when the selector matches nothing.
    NOTE(review): the selector looks whitespace-mangled in the original
    listing (".img-boximg" was probably ".img-box img"); kept
    byte-for-byte — verify against the live page.
    """
    dir_name = "image"
    os.makedirs(dir_name, exist_ok=True)
    file_paths = []
    img_nodes = soup.select(".search-main-result.result-content.result-list.item.item-img.img-boximg")
    if not img_nodes:
        print("没有找到当前节点下图片")
        return file_paths
    for index, node in enumerate(img_nodes):
        download_url = node.get('src')
        print("打印要下载的图片地址:", download_url)
        # Keep only the last URL path component; prefix with the index so
        # identical names (e.g. the site's error.jpg placeholder) don't collide.
        file_name = str(index) + "-" + os.path.basename(download_url.split("/")[-1])
        print("文件名:" + file_name)
        file_path = os.path.join(dir_name, file_name)
        file_paths.append(file_path)
        # Skip files already downloaded on a previous run.
        if not os.path.exists(file_path):
            response = requests.get(download_url)
            response.raise_for_status()
            # Bug fix: use a context manager so the handle is closed even
            # if a write raises (the original leaked the open file on error).
            with open(file_path, 'wb') as image_file:
                for chunk in response.iter_content(10000):
                    image_file.write(chunk)
    return file_paths
# Fetch the result page, scrape covers + metadata, render them in a grid.
def show(url):
    """Download the search page at *url* and display results in a Toplevel."""
    downloader = kongfuzi.DownLoad()
    html = downloader.get(url)
    # Append the user-supplied proxy to the pool.
    # NOTE(review): this runs *after* the page above was already fetched,
    # so the new proxy only affects subsequent requests — confirm intent.
    downloader.ip_list.append(e2.get())
    soup = changesoup(html)
    bookname, price, place, storename = bookinfo(soup)
    file_paths = imgsave(soup)

    window = Toplevel()
    window.geometry("1720x800")
    window.title("孔网图片爬取")

    # Keep PhotoImage references alive in a list for the window's lifetime;
    # Tk only renders images that are still referenced from Python.
    photos = [ImageTk.PhotoImage(Image.open(path)) for path in file_paths]

    canvas = Canvas(window, width=1700, height=800, scrollregion=(0, 0, 0, 4000))
    canvas.place(x=10, y=10)
    frame = Frame(canvas)  # the scrollable content lives in this frame
    frame.place(width=1680, height=800)

    # Bug fix: the original hard-coded range(50) and crashed with an
    # IndexError whenever a search returned fewer than 50 items; clamp
    # to the shortest of the parallel result lists.
    count = min(50, len(photos), len(bookname), len(price), len(place), len(storename))
    for i in range(count):
        # 5 items per visual row; each item occupies 5 grid rows
        # (image + four info labels).
        row_base = (i // 5) * 5
        col = i % 5
        Label(frame, image=photos[i], width=280, height=280).grid(
            row=row_base, column=col, padx=10, pady=5)
        Label(frame, text="书名:" + bookname[i], bg="#FFF8DC", justify=LEFT).grid(
            row=row_base + 1, column=col, padx=45, pady=2, sticky=W)
        Label(frame, text="价格:" + price[i] + "元", bg="#FFF8DC", justify=LEFT).grid(
            row=row_base + 2, column=col, padx=45, pady=2, sticky=W)
        Label(frame, text="发货地区:" + place[i], bg="#FFF8DC", justify=LEFT).grid(
            row=row_base + 3, column=col, padx=45, pady=2, sticky=W)
        Label(frame, text="书店:" + storename[i], bg="#FFF8DC", justify=LEFT).grid(
            row=row_base + 4, column=col, padx=45, pady=2, sticky=W)

    vbar = Scrollbar(canvas, orient=VERTICAL)  # vertical scrollbar
    vbar.place(x=1680, width=20, height=800)
    vbar.configure(command=canvas.yview)
    canvas.config(yscrollcommand=vbar.set)
    canvas.create_window((800, 2000), window=frame)
    mainloop()
if __name__ == '__main__':
    # Minimal launcher UI: keyword entry, proxy entry, search button.
    root = Tk()
    root.title("孔网图片爬取")
    e1 = Entry(root)  # search keyword (read by download())
    e2 = Entry(root)  # extra proxy ip:port (read by show())
    e1.grid(row=0, column=0, padx=20, pady=20)
    e2.grid(row=0, column=2, padx=20, pady=20)
    # Bug fix: the original bound the result of .grid() (always None) to
    # label1/label2/btn1; create the widgets without the dead bindings.
    Label(root, text="关键字", width=10).grid(row=0, column=1, padx=10, pady=5)
    Label(root, text="添加代理ip", width=10).grid(row=0, column=3, padx=10, pady=5)
    Button(root, text="搜索", width=10, command=download).grid(row=1, column=1, padx=10, pady=5)
    mainloop()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。