scrapy spider的几种爬取方式实例代码
本节课介绍了scrapy的爬虫框架,重点说了scrapy组件spider。
spider的几种爬取方式:
- 爬取1页内容
- 按照给定列表拼出链接爬取多页
- 找到"下一页"标签进行爬取
- 进入链接,按照链接进行爬取
下面分别给出了示例
1.爬取1页内容
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class JulyeduSpider(scrapy.Spider):
    """Crawl a single listing page of julyedu.com and yield one item per course.

    Demonstrates the simplest spider shape: a fixed ``start_urls`` list and a
    single ``parse`` callback that scrapes items from one page (no pagination).
    """

    name = "julyedu"
    start_urls = [
        'https://www.julyedu.com/category/index',
    ]

    def parse(self, response):
        """Yield a dict with title/desc/time/img_url for each course box.

        :param response: the downloaded page for each URL in ``start_urls``.
        """
        for julyedu_class in response.xpath('//div[@class="course_info_box"]'):
            # Debug output so the tutorial reader can see what was matched.
            print(julyedu_class.xpath('a/h4/text()').extract_first())
            print(julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first())
            print(julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first())
            # urljoin resolves a possibly-relative image src against the page URL.
            print(response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first()))
            print("\n")
            yield {
                'title': julyedu_class.xpath('a/h4/text()').extract_first(),
                'desc': julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first(),
                'time': julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first(),
                'img_url': response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first()),
            }
2.按照给定列表拼出链接爬取多页
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class CnBlogSpider(scrapy.Spider):
    """Crawl multiple cnblogs.com "pick" pages built from a URL template.

    Demonstrates seeding ``start_urls`` with a list comprehension instead of
    hard-coding each page URL.
    """

    name = "cnblogs"
    allowed_domains = ["cnblogs.com"]
    # NOTE(review): '#p%s' is a URL fragment, which browsers/clients do not
    # send to the server — all ten URLs likely fetch the same page. The real
    # pagination path may be '/pick/p%s' or similar; verify against the site.
    start_urls = [
        'http://www.cnblogs.com/pick/#p%s' % p for p in range(1, 11)
    ]

    def parse(self, response):
        """Yield one dict per article (title, links, summary, author, counters)."""
        for article in response.xpath('//div[@class="post_item"]'):
            # Debug output mirroring the yielded fields.
            print(article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip())
            print(response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip())
            print(article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip())
            print(response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip())
            print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip())
            print("")
            yield {
                'title': article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip(),
                'link': response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip(),
                'summary': article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip(),
                'author': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip(),
                'author_link': response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip(),
                'comment': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip(),
                'view': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip(),
            }
3.找到"下一页"标签进行爬取
import scrapy


class QuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com, following the "next page" link recursively.

    Demonstrates pagination: after scraping the current page, the spider
    finds the "next" link and yields a new Request back into ``parse``.
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        """Yield text/author per quote, then follow the next-page link if any."""
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
            }
        # BUGFIX: the original read '@herf' (typo), which matches nothing, so
        # next_page was always None and only the first page was ever crawled.
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page is not None:
            # Resolve the relative link and re-enter parse for the next page.
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
4.进入链接,按照链接进行爬取
# by 寒小阳 (hanxiaoyang.ml@gmail.com)
import scrapy


class QQNewsSpider(scrapy.Spider):
    """Crawl news.qq.com: collect article links from an index page, then
    follow each link and scrape the article with a second callback.

    Demonstrates the two-level "index page -> detail page" pattern.
    """

    name = 'qqnews'
    start_urls = ['http://news.qq.com/society_index.shtml']

    def parse(self, response):
        """Extract article links from the index page and request each one."""
        for href in response.xpath('//*[@id="news"]/div/div/div/div/em/a/@href'):
            # Resolve each (possibly relative) link before requesting it.
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        """Scrape one article page: title, body text, time and category."""
        print(response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first())
        print(response.xpath('//span[@class="a_time"]/text()').extract_first())
        print(response.xpath('//span[@class="a_catalog"]/a/text()').extract_first())
        # The article body is split across <p class="text"> paragraphs;
        # join them with newlines to rebuild the full text.
        print("\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()))
        print("")
        yield {
            'title': response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first(),
            'content': "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()),
            'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
            'cate': response.xpath('//span[@class="a_catalog"]/a/text()').extract_first(),
        }
总结
以上就是本文关于scrapy spider的几种爬取方式实例代码的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题,如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!