Example code for several Scrapy spider crawling patterns
This lesson introduces the Scrapy crawling framework, focusing on the Spider component.
A spider can crawl in several ways:
- crawling a single page
- building links from a given list to crawl multiple pages
- finding the 'next page' link and crawling through pages
- entering links and crawling the pages they point to
Examples of each are given below.
1. Crawling a single page
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class JulyeduSpider(scrapy.Spider):
    name = "julyedu"
    start_urls = [
        'https://www.julyedu.com/category/index',
    ]

    def parse(self, response):
        # Each course card on the listing page sits in a div with class "course_info_box"
        for julyedu_class in response.xpath('//div[@class="course_info_box"]'):
            print julyedu_class.xpath('a/h4/text()').extract_first()
            print julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first()
            print julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first()
            print response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first())
            print "\n"

            # Yield one item per course
            yield {
                'title': julyedu_class.xpath('a/h4/text()').extract_first(),
                'desc': julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first(),
                'time': julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first(),
                'img_url': response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first())
            }
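To try the spider above, save it to a file and run it with Scrapy's runspider command; no full project is needed. A minimal usage sketch, where the filename and output path are my own placeholders rather than part of the original:

    scrapy runspider julyedu_spider.py -o julyedu_courses.json

The -o option tells Scrapy's feed exporter to write every yielded dict to the given file.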
2. Building links from a given list to crawl multiple pages
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class CnBlogSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["cnblogs.com"]
    # Build the first 10 listing-page URLs up front from a known page range
    start_urls = [
        'http://www.cnblogs.com/pick/#p%s' % p for p in xrange(1, 11)
    ]

    def parse(self, response):
        for article in response.xpath('//div[@class="post_item"]'):
            print article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip()
            print response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip()
            print article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip()
            print article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip()
            print response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip()
            print article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip()
            print article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip()
            print ""

            yield {
                'title': article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip(),
                'link': response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip(),
                'summary': article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip(),
                'author': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip(),
                'author_link': response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip(),
                'comment': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip(),
                'view': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip(),
            }
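The same list-driven pattern can also be written by overriding start_requests() instead of precomputing start_urls, which makes the request generation explicit. A minimal sketch of that variant; the spider name and the empty parse() body are placeholders of mine, not part of the original example:

import scrapy


class ListPagesSpider(scrapy.Spider):
    name = "list_pages"

    def start_requests(self):
        # Build one request per page number from a known range
        for p in range(1, 11):
            url = 'http://www.cnblogs.com/pick/#p%s' % p
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # The extraction logic from the example above would go here
        pass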
3. Finding the 'next page' link and crawling through pages
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
            }

        # Follow the "next page" link if there is one
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
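On Scrapy 1.4 and later, the urljoin-plus-Request tail of parse() can be shortened with response.follow, which accepts the relative URL directly. A minimal sketch of just that pagination step, assuming the rest of the spider stays as above:

        # Equivalent pagination with response.follow (Scrapy >= 1.4),
        # which resolves the relative href against the current response
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)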
4. Entering links and crawling the pages they point to
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class QQNewsSpider(scrapy.Spider):
    name = 'qqnews'
    start_urls = ['http://news.qq.com/society_index.shtml']

    def parse(self, response):
        # Collect the article links from the listing page and follow each one
        for href in response.xpath('//*[@id="news"]/div/div/div/div/em/a/@href'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        # Called for every followed article page
        print response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first()
        print response.xpath('//span[@class="a_time"]/text()').extract_first()
        print response.xpath('//span[@class="a_catalog"]/a/text()').extract_first()
        print "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract())
        print ""

        yield {
            'title': response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first(),
            'content': "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()),
            'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
            'cate': response.xpath('//span[@class="a_catalog"]/a/text()').extract_first(),
        }
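As with the earlier spiders, this one can be run directly and its yielded items written out by the feed exporter. A usage sketch with a placeholder filename of my own:

    scrapy runspider qqnews_spider.py -o qqnews_items.jl

The .jl extension selects the JSON Lines format, which is convenient when many items are written incrementally.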
Summary
That is all for this article's example code covering the several Scrapy spider crawling patterns; I hope it helps. Interested readers can browse the other related topics on this site. If anything is missing, feel free to point it out in the comments. Thanks for supporting this site!