Python Crawler Package BeautifulSoup: Example (3)
Build a crawler step by step that scrapes jokes from Qiushibaike (糗事百科).
To start, we parse the page without the BeautifulSoup package.
Step 1: fetch the URL and grab the page source
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch the URL and grab the page source
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    print content.decode('utf-8')
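One caveat before moving on: the scripts in this post are Python 2 (the urllib2 module does not exist in Python 3, and print is a statement there). As a rough sketch only, the same fetch in Python 3 would go through urllib.request and urllib.error:

# Python 3 sketch of the same fetch; urllib2's pieces live in urllib.request / urllib.error
import urllib.request
import urllib.error

url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read()
    print(content.decode('utf-8'))
except (urllib.error.HTTPError, urllib.error.URLError) as e:
    print(e)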
Step 2: extract the information with a regular expression
First, look through the source to see where the content you need sits and how to identify it.
Then write a regular expression to match and pull it out.
Note that '.' in a regular expression does not match '\n', so the matching mode has to be set accordingly (the re.S flag).
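A minimal demonstration of the difference:

import re

text = 'abc\ndef'
print re.findall('abc.def', text)        # [] -- by default '.' stops at '\n'
print re.findall('abc.def', text, re.S)  # ['abc\ndef'] -- re.S (DOTALL) lets '.' match '\n'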
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch the URL and grab the page source
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    # NOTE: the HTML tag and class names in this pattern follow qiushibaike's
    # markup at the time of writing; adjust them if the page structure has changed
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(regex, content)
    for item in items:
        print item
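Since this series is ultimately about BeautifulSoup, it is worth noting that the same extraction can be written without a regex. A minimal sketch with BeautifulSoup 4, assuming the posts sit in <div class="content"> elements just as the pattern above does, and reusing the content variable from the script above:

from bs4 import BeautifulSoup

# assumes content holds the page source fetched above
soup = BeautifulSoup(content, 'html.parser')
for div in soup.find_all('div', class_='content'):
    print div.get_text().strip()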
Step 3: clean up the data and save it to files

# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 21:41:32
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch the URL and grab the page source
    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)
    url = 'http://www.qiushibaike.com/textnew/page/1/?s=4941357'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        request = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read()
    except urllib2.HTTPError as e:
        print e
        exit()
    except urllib2.URLError as e:
        print e
        exit()
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    items = re.findall(regex, content)
    count = 1
    for item in items:
        # drop stray newlines, then turn <br/> tags back into real line breaks
        item = item.replace('\n', '').replace('<br/>', '\n')
        filepath = path + '/' + str(count) + '.txt'
        f = open(filepath, 'w')
        f.write(item)
        f.close()
        count += 1
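A note on encodings: in Python 2, response.read() returns the raw bytes of the page (UTF-8 here), so writing item straight to a file works. If you decode the text to a unicode object first, open the file with an explicit encoding instead, for example:

import codecs

# only needed when item is a unicode object rather than a UTF-8 byte string
with codecs.open(filepath, 'w', encoding='utf-8') as f:
    f.write(item)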
Step 4: scrape the content from multiple pages
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date:   2016-12-22 16:16:08
# @Last Modified by:   HaonanWu
# @Last Modified time: 2016-12-22 20:17:13
import urllib
import urllib2
import re
import os

if __name__ == '__main__':
    # fetch and save the posts from several pages
    path = './qiubai'
    if not os.path.exists(path):
        os.makedirs(path)
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    regex = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
    count = 1
    for page in range(1, 36):  # the page count here is arbitrary; adjust as needed
        url = 'http://www.qiushibaike.com/textnew/page/' + str(page) + '/?s=4941357'
        try:
            request = urllib2.Request(url=url, headers=headers)
            response = urllib2.urlopen(request)
            content = response.read()
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print e
            exit()
        items = re.findall(regex, content)
        for item in items:
            item = item.replace('\n', '').replace('<br/>', '\n')
            filepath = path + '/' + str(count) + '.txt'
            f = open(filepath, 'w')
            f.write(item)
            f.close()
            count += 1
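Finally, when a crawler loops over many pages like this, it is polite to pause between requests so as not to hammer the site. A one-line addition inside the page loop does it (the one-second delay is an arbitrary choice):

import time

time.sleep(1)  # pause between page fetches to keep the request rate low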