Implementing "get the next page" in a Python crawler
Let's start with the example code:
from time import sleep

import faker
import requests
from lxml import etree

fake = faker.Faker()

base_url = "http://angelimg.spbeen.com"

def get_next_link(url):
    content = downloadHtml(url)
    html = etree.HTML(content)
    next_url = html.xpath("//a[@class='chnext']/@href")
    if next_url:
        return base_url + next_url[0]
    else:
        return False

def downloadHtml(url):
    user_agent = fake.user_agent()
    headers = {'User-Agent': user_agent, "Referer": "http://angelimg.spbeen.com/"}
    response = requests.get(url, headers=headers)
    return response.text

def getImgUrl(content):
    html = etree.HTML(content)
    img_url = html.xpath('//*[@id="content"]/a/img/@src')
    title = html.xpath("//div[@class='article']/h2/text()")
    return img_url[0], title[0]

def saveImg(title, img_url):
    if img_url is not None and title is not None:
        with open("txt/" + str(title) + ".jpg", 'wb') as f:
            user_agent = fake.user_agent()
            headers = {'User-Agent': user_agent, "Referer": "http://angelimg.spbeen.com/"}
            content = requests.get(img_url, headers=headers)
            # request_view(content)
            f.write(content.content)

def request_view(response):
    # Open the fetched page in a local browser tab for debugging.
    import webbrowser
    request_url = response.url
    # The HTML tags on the next two lines were stripped from the original post;
    # the usual pattern injects a <base> tag so relative links resolve
    # against the request URL.
    base_url = '<head><base href="%s">' % (request_url)
    base_url = base_url.encode()
    content = response.content.replace(b"<head>", base_url)
    tem_html = open('tmp.html', 'wb')
    tem_html.write(content)
    tem_html.close()
    webbrowser.open_new_tab('tmp.html')

def crawl_img(url):
    content = downloadHtml(url)
    res = getImgUrl(content)
    title = res[1]
    img_url = res[0]
    saveImg(title, img_url)

if __name__ == "__main__":
    url = "http://angelimg.spbeen.com/ang/4968/1"
    while url:
        print(url)
        crawl_img(url)
        url = get_next_link(url)
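A note before running the example above: it depends on the third-party packages faker, requests and lxml (installable with pip), and saveImg() writes into a txt/ folder relative to the working directory, so open() will raise an error if that folder does not exist. A minimal setup sketch, assuming the folder name stays txt/, could be added before the while loop:

import os

# Make sure the output folder used by saveImg() exists before crawling.
os.makedirs("txt", exist_ok=True)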
How a Python crawler can automatically loop through "next page" links to load text
from bs4 import BeautifulSoup
import requests
import time
from lxml import etree
import os

# This demo shows how to use BeautifulSoup to crawl some text.
def start():
    # Send the HTTP request
    html = requests.get('http://www.baidu.com')
    # Fix the encoding
    html.encoding = html.apparent_encoding
    # Build the soup object
    soup = BeautifulSoup(html.text, 'html.parser')
    print(type(soup))
    print('Printing the document')
    print(soup.prettify())
    # Grab the title; .string simply returns the tag's text
    title = soup.head.title.string
    print(title)
    # Write it to a text file
    with open(r'C:/Users/a/Desktop/a.txt', 'w') as f:
        f.write(title)
    print(time.localtime())

url_2 = 'http://news.gdzjdaily.com.cn/zjxw/politics/sz_4.shtml'

def get_html_from_bs4(url):
    # response = requests.get(url, headers=data, proxies=ip).content.decode('utf-8')
    response = requests.get(url).content.decode('utf-8')
    soup = BeautifulSoup(response, 'html.parser')
    next_page = soup.select('#displaypagenum a:nth-of-type(9)')[0].get('href')
    print(next_page)
    next2 = 'http://news.gdzjdaily.com.cn/zjxw/politics/' + next_page

def get_html_from_etree(url):
    response = requests.get(url).content.decode('utf-8')
    html = etree.HTML(response)
    next_page = html.xpath('.//a[@class="PageNum"][8]/@href')[0]
    print(next_page)
    # next2 = 'http://news.gdzjdaily.com.cn/zjxw/politics/' + next_page

get_html_from_etree(url_2)

if __name__ == '__main__':
    start()
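Note that this second demo only extracts and prints the next-page URL once; to actually loop through pages the way the first example does, the extracted link has to feed back into the next request. A minimal sketch of that loop, reusing the same XPath and base URL from get_html_from_etree and adding a hypothetical max_pages limit so the loop cannot run forever:

def crawl_all_pages(start_url, max_pages=10):
    # Follow the next-page link repeatedly, like the image crawler above.
    url = start_url
    count = 0
    while url and count < max_pages:
        print(url)
        response = requests.get(url).content.decode('utf-8')
        html = etree.HTML(response)
        next_page = html.xpath('.//a[@class="PageNum"][8]/@href')
        # Stop when the current page has no next-page link.
        if next_page:
            url = 'http://news.gdzjdaily.com.cn/zjxw/politics/' + next_page[0]
        else:
            url = None
        count += 1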
This concludes the article on implementing "get the next page" in a Python crawler. For more on fetching the next page with Python crawlers, please search 毛票票's earlier articles or keep browsing the related articles below, and we hope you will continue to support 毛票票!