scrapy spider的几种爬取方式实例代码
本节课介绍了scrapy的爬虫框架,重点说了scrapy组件spider。
spider的几种爬取方式:
- 爬取1页内容
- 按照给定列表拼出链接爬取多页
- 找到"下一页"标签进行爬取
- 进入链接,按照链接进行爬取
下面分别给出了示例
1.爬取1页内容
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class JulyeduSpider(scrapy.Spider):
    """Crawl a single listing page of julyedu.com and yield one item per course.

    Demonstrates the simplest spider shape: a fixed ``start_urls`` list and a
    single ``parse`` callback that scrapes items from one page (no pagination).
    """

    name = "julyedu"
    start_urls = [
        'https://www.julyedu.com/category/index',
    ]

    def parse(self, response):
        """Yield a dict with title/desc/time/img_url for each course box.

        :param response: the downloaded page for each URL in ``start_urls``.
        """
        for julyedu_class in response.xpath('//div[@class="course_info_box"]'):
            # Debug output so the tutorial reader can see what was matched.
            print(julyedu_class.xpath('a/h4/text()').extract_first())
            print(julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first())
            print(julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first())
            # urljoin resolves a possibly-relative image src against the page URL.
            print(response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first()))
            print("\n")
            yield {
                'title': julyedu_class.xpath('a/h4/text()').extract_first(),
                'desc': julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first(),
                'time': julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first(),
                'img_url': response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first()),
            }
2.按照给定列表拼出链接爬取多页
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class CnBlogSpider(scrapy.Spider):
    """Crawl multiple cnblogs.com "pick" pages built from a URL template.

    Demonstrates seeding ``start_urls`` with a list comprehension instead of
    hard-coding each page URL.
    """

    name = "cnblogs"
    allowed_domains = ["cnblogs.com"]
    # NOTE(review): '#p%s' is a URL fragment, which browsers/clients do not
    # send to the server — all ten URLs likely fetch the same page. The real
    # pagination path may be '/pick/p%s' or similar; verify against the site.
    start_urls = [
        'http://www.cnblogs.com/pick/#p%s' % p for p in range(1, 11)
    ]

    def parse(self, response):
        """Yield one dict per article (title, links, summary, author, counters)."""
        for article in response.xpath('//div[@class="post_item"]'):
            # Debug output mirroring the yielded fields.
            print(article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip())
            print(response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip())
            print(article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip())
            print(response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip())
            print("")
            yield {
                'title': article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip(),
                'link': response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip(),
                'summary': article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip(),
                'author': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip(),
                'author_link': response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip(),
                'comment': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip(),
                'view': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip(),
            }
3.找到"下一页"标签进行爬取
import scrapy


class QuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com, following the "next page" link recursively.

    Demonstrates pagination: after scraping the current page, the spider
    finds the "next" link and yields a new Request back into ``parse``.
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        """Yield text/author per quote, then follow the next-page link if any."""
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
            }
        # BUGFIX: the original read '@herf' (typo), which matches nothing, so
        # next_page was always None and only the first page was ever crawled.
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            # Resolve the relative link and re-enter parse for the next page.
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
4.进入链接,按照链接进行爬取
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class QQNewsSpider(scrapy.Spider):
    """Crawl news.qq.com: collect article links from an index page, then
    follow each link and scrape the article with a second callback.

    Demonstrates the two-level "index page -> detail page" pattern.
    """

    name = 'qqnews'
    start_urls = ['http://news.qq.com/society_index.shtml']

    def parse(self, response):
        """Extract article links from the index page and request each one."""
        for href in response.xpath('//*[@id="news"]/div/div/div/div/em/a/@href'):
            # Resolve each (possibly relative) link before requesting it.
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        """Scrape one article page: title, body text, time and category."""
        print(response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first())
        print(response.xpath('//span[@class="a_time"]/text()').extract_first())
        print(response.xpath('//span[@class="a_catalog"]/a/text()').extract_first())
        # The article body is split across <p class="text"> paragraphs;
        # join them with newlines to rebuild the full text.
        print("\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()))
        print("")
        yield {
            'title': response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first(),
            'content': "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()),
            'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
            'cate': response.xpath('//span[@class="a_catalog"]/a/text()').extract_first(),
        }
总结
以上就是本文关于scrapy spider的几种爬取方式实例代码的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题,如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!