Python使用scrapy爬取阳光热线问政平台过程解析
目的:爬取阳光热线问政平台问题反映每个帖子里面的标题、内容、编号和帖子url
CrawlSpider版流程如下:
创建爬虫项目dongguan
scrapy startproject dongguan
设置items.py文件
# -*- coding: utf-8 -*-
import scrapy


class DongguanItem(scrapy.Item):
    """Fields scraped from a single post on the complaint platform.

    The spider in this project imports this class as
    ``from dongguan.items import DongguanItem``, so the class carries that
    name; ``NewdongguanItem`` (the name used in the published listing) is
    kept below as an alias.
    """

    # URL of the post page
    url = scrapy.Field()
    # Title of the post
    title = scrapy.Field()
    # Reference number of the post
    number = scrapy.Field()
    # Body text of the post
    content = scrapy.Field()


# Backward-compatible alias for the name used in the original listing.
NewdongguanItem = DongguanItem
在spiders目录里面,创建并编写爬虫文件sun.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    """CrawlSpider that walks the paginated listing and scrapes every post.

    For each post page it yields a ``DongguanItem`` holding the page URL,
    the heading text, the post number parsed from that heading, and the
    post body.
    """

    name = 'dg'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/html/top/report.shtml']

    # ``rules`` is a collection of Rule objects; every rule is applied to
    # each response. If the server uses anti-crawling tricks such as
    # returning fake URLs, Rule's ``process_links`` parameter can point to a
    # custom function that turns them back into real URLs.
    rules = (
        # Pagination links. A Rule without a callback follows the matched
        # links by default: each one is requested and its response is
        # searched for further matching links, but nothing processes the
        # response data. With follow=False only the links found on the
        # current page would be used; with follow=True matching continues
        # on every fetched page until no new links are found.
        Rule(LinkExtractor(allow="page=")),
        # Post pages. Raw string so ``\d`` is a regex escape rather than an
        # invalid string escape, and literal dots are escaped.
        Rule(
            LinkExtractor(
                allow=r'http://wz\.sun0769\.com/html/question/\d+/\d+\.shtml'
            ),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Build one item from a post page.

        Yields nothing (and logs a warning) when the heading is missing,
        instead of failing with IndexError.
        """
        print(response.url)

        # NOTE(review): the published listing had its whitespace stripped,
        # so the class values used in the XPath expressions below may
        # originally have contained a space - confirm against the live page.
        heading = response.xpath(
            '//div[@class="pagecenterp3"]//strong/text()'
        ).extract_first()
        if heading is None:
            self.logger.warning('No heading found on %s', response.url)
            return

        item = DongguanItem()
        item['url'] = response.url
        item['title'] = heading

        # The number is the part after the last ':' in the last
        # whitespace-separated token of the heading. The listing showed
        # split(''), which always raises ValueError (empty separator).
        tokens = heading.split()
        item['number'] = tokens[-1].split(':')[-1] if tokens else ''

        # Posts with an image keep their text in div.contentext; when there
        # is no image that div is absent and the text sits in the other div.
        if response.xpath('//div[@class="contentext"]'):
            content_xpath = '//div[@class="contentext"]/text()'
        else:
            content_xpath = '//div[@class="c1text14_2"]/text()'
        item['content'] = ''.join(response.xpath(content_xpath).extract())

        yield item
编写管道pipelines.py文件
# -*- coding: utf-8 -*-
import io
import json


class DongguanPipeline(object):
    """Write every scraped item to ``dongguan.json``, one JSON object per line."""

    def __init__(self):
        # Open in text mode with an explicit encoding so non-ASCII text is
        # written as UTF-8 without encoding each line by hand.
        self.file = io.open('dongguan.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and return it unchanged."""
        # The listing encoded the JSON to bytes and then appended the str
        # '\n', which raises TypeError on Python 3; keep everything as text.
        line = json.dumps(dict(item), ensure_ascii=False) + u'\n'
        self.file.write(line)
        return item

    def close_spider(self, spider=None):
        """Close the output file.

        Scrapy calls the hook named ``close_spider`` with the spider as
        argument; the listing's ``closespider(self)`` was never invoked, so
        the file was never closed explicitly.
        """
        self.file.close()

    # Backward-compatible alias for the method name used in the listing.
    closespider = close_spider
编写settings.py文件
# -*- coding: utf-8 -*-
"""Scrapy settings for the ``dongguan`` project."""

BOT_NAME = 'dongguan'

SPIDER_MODULES = ['dongguan.spiders']
NEWSPIDER_MODULE = 'dongguan.spiders'

# The log file is written to the current directory by default.
LOG_FILE = 'dongguan.log'
# Log level: messages at INFO or above are saved.
LOG_LEVEL = 'INFO'

# Crawl depth limit
# DEPTH_LIMIT = 1

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'dongguan (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {'dongguan.pipelines.DongguanPipeline': 300}
测试运行爬虫,终端执行命令(只要在项目目录内即可)
scrapy crawl dg
Spider版流程如下:
创建爬虫项目newdongguan
scrapy startproject newdongguan
设置items.py文件
# -*- coding: utf-8 -*-
import scrapy


class NewdongguanItem(scrapy.Item):
    """Fields scraped from a single post on the complaint platform."""

    # URL of the post page
    url = scrapy.Field()
    # Title of the post
    title = scrapy.Field()
    # Reference number of the post
    number = scrapy.Field()
    # Body text of the post
    content = scrapy.Field()
在spiders目录里面,创建并编写爬虫文件newsun.py
# -*- coding: utf-8 -*-
import scrapy

from newdongguan.items import NewdongguanItem


class NewsunSpider(scrapy.Spider):
    """Plain Spider that pages through the listing and scrapes every post.

    ``parse`` handles listing pages: it requests each post and then the
    next listing page. ``deal_link`` handles post pages and yields one
    ``NewdongguanItem`` per post.
    """

    name = 'ndg'
    # Restricts the crawl to these domains. Optional, but without it the
    # crawl is not limited to any domain and may run out of control.
    allowed_domains = ['wz.sun0769.com']

    # Listing URL prefix; the page offset is appended to it.
    base_url = 'http://wz.sun0769.com/index.php/question/report?page='
    # Amount added to the offset for each following listing page.
    page_size = 30
    # Last offset that is still requested.
    max_offset = 124260

    offset = 0
    url = base_url + str(offset)
    start_urls = [url]

    def parse(self, response):
        """Request every post linked from a listing page, then the next page."""
        for href in response.xpath("//a[@class='news14']/@href").extract():
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.deal_link)

        # Advance once per listing page (not once per link).
        self.offset += self.page_size
        if self.offset <= self.max_offset:
            next_url = self.base_url + str(self.offset)
            # The next listing page is handled by this same method.
            yield scrapy.Request(next_url, callback=self.parse)

    def deal_link(self, response):
        """Build one item from a post page.

        Yields nothing (and logs a warning) when the heading is missing,
        instead of failing with IndexError.
        """
        # NOTE(review): the published listing had its whitespace stripped,
        # so the class values used in the XPath expressions below may
        # originally have contained a space - confirm against the live page.
        heading = response.xpath(
            "//div[@class='pagecenterp3']//strong[@class='tgray14']/text()"
        ).extract_first()
        if heading is None:
            self.logger.warning('No heading found on %s', response.url)
            return

        item = NewdongguanItem()
        item['url'] = response.url
        item['title'] = heading

        # The number is the part after the last ':' in the last
        # whitespace-separated token of the heading. The listing showed
        # split(''), which always raises ValueError (empty separator).
        tokens = heading.split()
        item['number'] = tokens[-1].split(':')[-1] if tokens else ''

        # Use div.contentext when the page has one, otherwise fall back to
        # the other text container.
        if response.xpath("//div[@class='contentext']"):
            content_xpath = "//div[@class='contentext']/text()"
        else:
            content_xpath = "//div[@class='c1text14_2']/text()"
        item['content'] = ''.join(response.xpath(content_xpath).extract())

        yield item
编写管道pipelines.py文件
# -*- coding: utf-8 -*-
import codecs
import json


class NewdongguanPipeline(object):
    """Write every scraped item to ``newdongguan.json``, one JSON object per line."""

    def __init__(self):
        # codecs.open fixes the file encoding once, so each write does not
        # have to encode its content. The earlier approach was a plain
        # open('newdongguan.json', 'w') plus content.encode('utf-8') on
        # every write.
        self.file = codecs.open('newdongguan.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Print the item's title, append the item as a JSON line, return it."""
        print(item['title'])
        content = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(content)
        return item

    def close_spider(self, spider=None):
        """Close the output file.

        Scrapy passes the spider to this hook; the listing's signature took
        no argument, so the call failed with TypeError. The default keeps
        argument-less calls working.
        """
        self.file.close()
编写settings.py文件
# -*- coding: utf-8 -*-
"""Scrapy settings for the ``newdongguan`` project."""

BOT_NAME = 'newdongguan'

SPIDER_MODULES = ['newdongguan.spiders']
NEWSPIDER_MODULE = 'newdongguan.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'newdongguan (+http://www.yourdomain.com)'
# The setting holds only the header *value*: the listing had the header name
# "User-Agent:" embedded in it, had lost its spaces, and was missing the
# closing parenthesis.
USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'newdongguan.pipelines.NewdongguanPipeline': 300,
}
测试运行爬虫,终端执行命令
scrapy crawl ndg
备注:markdown语法关于代码块缩进问题,可通过tab键来解决。而简单文本则可以通过回车键来解决,如Spider版流程如下:和1.创建爬虫项目newdongguang
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。