Python Crawler Package BeautifulSoup: Example (3)
Build a crawler step by step that scrapes jokes from Qiushibaike (糗事百科).
To start, we parse the page without the BeautifulSoup package.
Step 1: fetch the URL and grab the page source
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch the URL and grab the page source
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    print content.decode('utf-8')
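One caveat before moving on: the scripts in this post are Python 2 (the urllib2 module does not exist in Python 3, and print is a statement there). As a rough sketch only, the same fetch in Python 3 would go through urllib.request and urllib.error:

# Python 3 sketch of the same fetch; urllib2's pieces live in urllib.request / urllib.error
import urllib.request
import urllib.error

url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read()
    print(content.decode('utf-8'))
except (urllib.error.HTTPError, urllib.error.URLError) as e:
    print(e)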
Step 2: extract the information with a regular expression
First, look through the source to see where the content you need sits and how to identify it.
Then write a regular expression to match and pull it out.
Note that '.' in a regular expression does not match '\n', so the matching mode has to be set accordingly (the re.S flag).
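A minimal demonstration of the difference:

import re

text = 'abc\ndef'
print re.findall('abc.def', text)        # [] -- by default '.' stops at '\n'
print re.findall('abc.def', text, re.S)  # ['abc\ndef'] -- re.S (DOTALL) lets '.' match '\n'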
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch the URL and grab the page source
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    # NOTE: the HTML tag and class names in this pattern follow qiushibaike's
    # markup at the time of writing; adjust them if the page structure has changed
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(regex, content)
    for item in items:
        print item
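Since this series is ultimately about BeautifulSoup, it is worth noting that the same extraction can be written without a regex. A minimal sketch with BeautifulSoup 4, assuming the posts sit in <div class="content"> elements just as the pattern above does, and reusing the content variable from the script above:

from bs4 import BeautifulSoup

# assumes content holds the page source fetched above
soup = BeautifulSoup(content, 'html.parser')
for div in soup.find_all('div', class_='content'):
    print div.get_text().strip()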
Step 3: clean up the data and save it to files

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 21:41:32
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch the URL and grab the page source
    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(regex, content)
    count = 1
    for item in items:
        # drop stray newlines, then turn <br/> tags back into real line breaks
        item = item.replace('\n', '').replace('<br/>', '\n')
        filepath = path + '/' + str(count) + '.txt'
        f = open(filepath, 'w')
        f.write(item)
        f.close()
        count += 1
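A note on encodings: in Python 2, response.read() returns the raw bytes of the page (UTF-8 here), so writing item straight to a file works. If you decode the text to a unicode object first, open the file with an explicit encoding instead, for example:

import codecs

# only needed when item is a unicode object rather than a UTF-8 byte string
with codecs.open(filepath, 'w', encoding='utf-8') as f:
    f.write(item)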
Step 4: scrape the content from multiple pages
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch and save the posts from several pages
    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    count = 1
    for page in range(1, 36):  # the page count here is arbitrary; adjust as needed
        url = 'http://www.qiushibaike.com/textnew/page/' + str(page) + '/?s=4941357'
        try:
            request = urllib2.Request(url=url, headers=headers)
            response = urllib2.urlopen(request)
            content = response.read()
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print e
            exit()
        items = re.findall(regex, content)
        for item in items:
            item = item.replace('\n', '').replace('<br/>', '\n')
            filepath = path + '/' + str(count) + '.txt'
            f = open(filepath, 'w')
            f.write(item)
            f.close()
            count += 1
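Finally, when a crawler loops over many pages like this, it is polite to pause between requests so as not to hammer the site. A one-line addition inside the page loop does it (the one-second delay is an arbitrary choice):

import time

time.sleep(1)  # pause between page fetches to keep the request rate low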