python爬虫库scrapy简单使用实例详解

2023-08-01 20:07:04 332

最近因为项目需求，需要写个爬虫爬取一些题库。在这之前爬虫我都是用node或者php写的。一直听说python写爬虫有一手，便入手了python的爬虫框架scrapy.

下面简单地介绍一下scrapy的目录结构与使用：

首先我们得安装scrapy框架

pip install scrapy

接着使用scrapy命令创建一个爬虫项目：

scrapy startproject questions

相关文件简介：

scrapy.cfg:项目的配置文件

questions/:该项目的python模块。之后您将在此加入代码。

questions/items.py:项目中的item文件.

questions/pipelines.py:项目中的pipelines文件.

questions/settings.py:项目的设置文件.

questions/spiders/:放置spider代码的目录.

questions/spiders/xueersi.py:实现爬虫的主体代码.

xueersi.py 爬虫主体

# -*- coding: utf-8 -*-
importscrapy
importtime
importnumpy
importre
fromquestions.itemsimportQuestionsItem
class xueersiSpider(scrapy.Spider):
    """Crawl multiple-choice questions from tiku.xueersi.com.

    Flow: ``parse`` reads the page count from a subject's first listing page
    and schedules every listing page; ``getquestion`` extracts the questions
    on one listing page and follows each question's answer link;
    ``getanswer`` completes the item with answer and analysis and emits it.
    """

    name = "xueersi"  # spider name, used as `scrapy crawl xueersi`
    allowed_domains = ["tiku.xueersi.com"]  # domain the crawl is limited to
    # First listing page of each subject. The number after "list_1_" is the
    # 1-based subject id; the trailing number is the page number.
    start_urls = [
        "http://tiku.xueersi.com/shiti/list_1_1_0_0_4_0_1",
        "http://tiku.xueersi.com/shiti/list_1_2_0_0_4_0_1",
        "http://tiku.xueersi.com/shiti/list_1_3_0_0_4_0_1",
    ]
    # Difficulty labels as shown on the site (fairly easy / medium / fairly
    # hard); an item's level is the 1-based position in this list.
    levels = ['偏易', '中档', '偏难']
    # Subject names (English / Chinese / Math), indexed by subject id - 1.
    subjects = ['英语', '语文', '数学']

    # Patterns are compiled once here instead of once per scraped question.
    # NOTE(review): the published listing lost the opening tag in front of
    # the first group (leaving an invalid leading "?"); it is reconstructed
    # here as the content-area <div> -- confirm against the live markup.
    _QUESTION_RE = re.compile(
        r'<div class="content-area">?([\s\S]+?)<(table|\/td|div|br)')
    _TAG_RE = re.compile(r'<[^>]+>', re.S)
    _SUBJECT_RE = re.compile(
        r'http:\/\/tiku\.xueersi\.com\/shiti\/list_1_(\d+)')
    _LEVEL_RE = re.compile(r'难度：([\s\S]+?)<')
    _OPTION_RE = re.compile(r'[A-D][\.．]([\s\S]+?)<(\/td|\/p|br)')
    # NOTE(review): kept exactly as published; a leading "<td>" may have been
    # stripped from this pattern by the page it was copied from -- confirm.
    _ANSWER_CELL_RE = re.compile(r'([\s\S]+?)<\/td>')
    # NOTE(review): the published pattern was broken across two lines where
    # a line-break tag had been rendered; reconstructed as a tolerant <br>.
    _ANSWER_RE = re.compile(r'([\s\S]+?)<br\s*\/?>[\s\S]+?([A-D])')

    def parse(self, response):
        """Schedule one request per listing page of the current subject.

        Scrapy calls this for every URL in ``start_urls`` because the spider
        does not override ``start_requests``.

        Yields:
            scrapy.Request: one request per page, handled by ``getquestion``.
        """
        link_texts = response.xpath(
            "//ul[@class='pagination']/li/a/text()").extract()
        # The text of the 4th pagination link is used as the total page
        # count. Fall back to a single page when it is missing or not a
        # number instead of crashing the callback.
        if len(link_texts) > 3 and link_texts[3].strip().isdigit():
            total_page = int(link_texts[3])
        else:
            self.logger.warning(
                "no page count found on %s; crawling one page", response.url)
            total_page = 1
        # Page numbers in the URL start at 1 (the start URLs end in "_1"),
        # so iterate 1..total_page rather than 0..total_page-1.
        for page in range(1, total_page + 1):
            page_url = response.url.replace(
                '_0_0_4_0_1', "_0_0_4_0_" + str(page))
            yield scrapy.Request(page_url, callback=self.getquestion)

    def getquestion(self, response):
        """Extract every question on a listing page.

        Yields:
            scrapy.Request: a request for the question's answer page with
            the partially filled item stored in ``meta['item']``. Questions
            without recognisable text, difficulty, options or answer link
            are skipped.
        """
        # The subject depends only on the page URL, so resolve it once.
        subject_ids = self._SUBJECT_RE.findall(response.url)
        if not subject_ids or not 1 <= int(subject_ids[0]) <= len(self.subjects):
            self.logger.warning("unknown subject in url %s", response.url)
            return
        subject = self.subjects[int(subject_ids[0]) - 1]

        rows = response.xpath(
            '//div[@class="main-wrap"]/ul[@class="items"]/li')
        for res in rows:
            questions = res.xpath(
                './div[@class="content-area"]').re(self._QUESTION_RE)
            if not questions:
                continue

            item = QuestionsItem()
            source = questions[0].strip()
            item['source'] = source  # question markup as scraped
            item['content'] = self._TAG_RE.sub('', source)  # tags removed
            item['subject'] = subject

            # Relative path: an absolute "//div" here would match the info
            # block of every question on the page, so each item would get
            # the first question's difficulty.
            level_texts = res.xpath(
                './div[@class="info"]').re(self._LEVEL_RE)
            level = level_texts[0].strip() if level_texts else None
            if level not in self.levels:
                self.logger.warning(
                    "unrecognised difficulty %r on %s", level, response.url)
                continue
            item['level'] = self.levels.index(level) + 1

            # Each option is a (text, closing_tag) tuple from the two groups.
            options = self._OPTION_RE.findall(res.extract())
            item['options'] = options
            if not options:
                continue

            href = res.xpath('./div[@class="info"]/a/@href').extract_first()
            if not href:
                continue
            # urljoin keeps absolute links intact and resolves relative ones.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.getanswer,
                meta={'item': item},  # hand the item to the next callback
            )

    def getanswer(self, response):
        """Complete the item carried in ``response.meta`` and emit it.

        Yields:
            QuestionsItem: the item with ``answer``, ``analysis`` and
            ``answer_url`` filled in; Scrapy passes it on to the configured
            item pipelines. Nothing is yielded when no answer text is found.
        """
        cells = response.xpath('//div[@class="part"]').re(self._ANSWER_CELL_RE)
        if not cells:
            self.logger.warning("no answer block on %s", response.url)
            return

        # Preferred form: analysis text, a line break, then the answer letter.
        match = self._ANSWER_RE.search(cells[0])
        if match:
            analysis, answer = match.group(1), match.group(2)
        else:
            # No analysis present: the whole cell is taken as the answer.
            analysis, answer = '', cells[0]

        if answer:
            item = response.meta['item']
            item['answer'] = answer.strip()
            item['analysis'] = analysis.strip()
            item['answer_url'] = response.url
            yield item

items.py数据结构定义:

# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
importscrapy
class QuestionsItem(scrapy.Item):
    """Fields collected for one scraped multiple-choice question."""

    content = scrapy.Field()     # question text with HTML tags removed
    subject = scrapy.Field()     # subject name
    level = scrapy.Field()       # difficulty as a 1-based number
    answer = scrapy.Field()      # answer text (normally a letter A-D)
    options = scrapy.Field()     # list of answer options
    analysis = scrapy.Field()    # explanation of the answer, may be empty
    source = scrapy.Field()      # question markup as scraped
    answer_url = scrapy.Field()  # URL of the page the answer came from

pipelines.py输出管道（本例子输出的数据写入本地数据库）：

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
importpymysql
importmd5
class QuestionsPipeline(object):
    """Item pipeline that stores questions and their options in MySQL.

    Questions go to the ``question`` table and their options to the
    ``options`` table. A question is skipped when a row with the same
    content hash already exists.
    """

    def __init__(self):
        # Keyword arguments work across PyMySQL versions; newer releases no
        # longer accept positional connection arguments.
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='question',
            use_unicode=True,
            charset='utf8',
        )
        self.cursor = self.connect.cursor()
        print("connecting mysql success!")
        # Option letters in display order; position == option index.
        self.answer = ['A', 'B', 'C', 'D']

    def process_item(self, item, spider):
        """Insert the question and its options unless it is already stored.

        All values are passed as query parameters, so scraped text cannot
        alter the SQL statements.

        NOTE: the duplicate-detection hash is the MD5 of the raw UTF-8
        question text. The earlier implementation hashed the SQL-escaped
        text, so hashes differ for content containing escaped characters.

        Args:
            item: mapping with the keys ``content``, ``source``,
                ``subject``, ``level``, ``answer``, ``analysis``,
                ``answer_url`` and ``options`` (sequence whose elements
                hold the option text at index 0).
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipelines can process it.

        Raises:
            Exception: any database error is re-raised after a rollback.
        """
        import hashlib  # local import: the module header is left untouched

        content = item['content']
        raw = content if isinstance(content, bytes) else content.encode('utf-8')
        digest = hashlib.md5(raw).hexdigest()

        self.cursor.execute(
            "select id from question where hash=%s", (digest,))
        if self.cursor.fetchone():
            return item  # duplicate question, nothing to store

        # Index of the correct option, or None when the answer is not one
        # of A-D (then no option is flagged as correct).
        try:
            correct_index = self.answer.index(item['answer'])
        except ValueError:
            correct_index = None

        try:
            self.cursor.execute(
                "insert into question"
                "(content,source,subject,level,answer,analysis,hash,answer_url) "
                "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)",
                (
                    content,
                    item['source'],
                    item['subject'],
                    item['level'],
                    item['answer'],
                    item['analysis'],
                    digest,
                    item['answer_url'],
                ),
            )
            qid = self.cursor.lastrowid
            for index, option in enumerate(item['options']):
                # Stored flag: '2' marks the correct option, '1' the others.
                flag = '2' if index == correct_index else '1'
                self.cursor.execute(
                    "insert into options(content,qid,answer) "
                    "VALUES(%s,%s,%s)",
                    (option[0], qid, flag),
                )
            self.connect.commit()
        except Exception:
            # Do not leave a half-written question in the open transaction.
            self.connect.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close cursor and connection; Scrapy calls this when the spider ends."""
        self.cursor.close()
        self.connect.close()

爬虫构建完毕后，在项目的根目录下运行

scrapy crawl xueersi  # scrapy crawl 爬虫的名称

更多关于python爬虫库scrapy使用方法请查看下面的相关链接

声明：本文内容来源于网络，版权归原作者所有，内容由互联网用户自发贡献自行上传，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。如果您发现有涉嫌版权的内容，欢迎发送邮件至：czq8825#qq.com（发邮件时，请将#更换为@）进行举报，并提供相关证据，一经查实，本站将立刻删除涉嫌侵权内容。

python爬虫库scrapy简单使用实例详解

热门推荐

随机推荐