• Follow links and crawl the pages they lead to
  • Examples of each crawling approach are given below

    1. Crawling a single page

    # by 寒小阳 (hanxiaoyang.ml@gmail.com)

    import scrapy


    class JulyeduSpider(scrapy.Spider):
        name = "julyedu"
        start_urls = [
            'https://www.julyedu.com/category/index',
        ]

        def parse(self, response):
            # Each course card on the listing page
            for julyedu_class in response.xpath('//div[@class="course_info_box"]'):
                print(julyedu_class.xpath('a/h4/text()').extract_first())
                print(julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first())
                print(julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first())
                print(response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first()))
                print("\n")

                yield {
                    'title': julyedu_class.xpath('a/h4/text()').extract_first(),
                    'desc': julyedu_class.xpath('a/p[@class="course-info-tip"][1]/text()').extract_first(),
                    'time': julyedu_class.xpath('a/p[@class="course-info-tip"][2]/text()').extract_first(),
                    'img_url': response.urljoin(julyedu_class.xpath('a/img[1]/@src').extract_first())
                }
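    Assuming the spider above is saved as julyedu_spider.py (the filename is arbitrary), it can be run as a standalone script with Scrapy's runspider command, dumping the yielded items to a JSON file:

    scrapy runspider julyedu_spider.py -o julyedu_courses.json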
    

    2. Building page URLs from a given list and crawling multiple pages

    # by 寒小阳 (hanxiaoyang.ml@gmail.com)

    import scrapy


    class CnBlogSpider(scrapy.Spider):
        name = "cnblogs"
        allowed_domains = ["cnblogs.com"]
        # Build the URLs for pages 1-10 up front
        start_urls = [
            'http://www.cnblogs.com/pick/#p%s' % p for p in range(1, 11)
        ]

        def parse(self, response):
            for article in response.xpath('//div[@class="post_item"]'):
                print(article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip())
                print(response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip())
                print(article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip())
                print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip())
                print(response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip())
                print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip())
                print(article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip())
                print("")

                yield {
                    'title': article.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first().strip(),
                    'link': response.urljoin(article.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()).strip(),
                    'summary': article.xpath('div[@class="post_item_body"]/p/text()').extract_first().strip(),
                    'author': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/a/text()').extract_first().strip(),
                    'author_link': response.urljoin(article.xpath('div[@class="post_item_body"]/div/a/@href').extract_first()).strip(),
                    'comment': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_comment"]/a/text()').extract_first().strip(),
                    'view': article.xpath('div[@class="post_item_body"]/div[@class="post_item_foot"]/span[@class="article_view"]/a/text()').extract_first().strip(),
                }
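    As a side note, instead of precomputing start_urls with a list comprehension, the same ten pages can be generated on demand by overriding start_requests(). Below is a minimal sketch of that variant; the spider name is our own choice, and the extraction logic in parse (identical to the spider above) is elided:

    import scrapy


    class CnBlogStartRequestsSpider(scrapy.Spider):
        name = "cnblogs_alt"  # hypothetical name for this variant
        allowed_domains = ["cnblogs.com"]

        def start_requests(self):
            # Yield one Request per listing page instead of listing URLs upfront
            for p in range(1, 11):
                yield scrapy.Request('http://www.cnblogs.com/pick/#p%s' % p, callback=self.parse)

        def parse(self, response):
            # Same per-article extraction as in the spider above
            ...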

    3. Finding the "next page" link and following it

    import scrapy


    class QuotesSpider(scrapy.Spider):
        name = "quotes"
        start_urls = [
            'http://quotes.toscrape.com/tag/humor/',
        ]

        def parse(self, response):
            for quote in response.xpath('//div[@class="quote"]'):
                yield {
                    'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                    'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
                }

            # The "next page" link lives on the <a> inside <li class="next">
            next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
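    On Scrapy 1.4 and later, response.follow can replace the urljoin + scrapy.Request pair, since it resolves relative URLs against the current page by itself. A minimal sketch of the same pagination spider using it (the spider name is our own choice):

    import scrapy


    class QuotesFollowSpider(scrapy.Spider):
        name = "quotes_follow"  # hypothetical name for this variant
        start_urls = ['http://quotes.toscrape.com/tag/humor/']

        def parse(self, response):
            for quote in response.xpath('//div[@class="quote"]'):
                yield {
                    'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                    'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
                }
            next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
            if next_page is not None:
                # response.follow resolves the relative URL for us
                yield response.follow(next_page, callback=self.parse)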

    4. Following links into detail pages and crawling those

    # by 寒小阳 (hanxiaoyang.ml@gmail.com)

    import scrapy


    class QQNewsSpider(scrapy.Spider):
        name = 'qqnews'
        start_urls = ['http://news.qq.com/society_index.shtml']

        def parse(self, response):
            # Collect the article links from the index page and follow each one
            for href in response.xpath('//*[@id="news"]/div/div/div/div/em/a/@href'):
                full_url = response.urljoin(href.extract())
                yield scrapy.Request(full_url, callback=self.parse_question)

        def parse_question(self, response):
            print(response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first())
            print(response.xpath('//span[@class="a_time"]/text()').extract_first())
            print(response.xpath('//span[@class="a_catalog"]/a/text()').extract_first())
            print("\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()))
            print("")
            yield {
                'title': response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first(),
                'content': "\n".join(response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p[@class="text"]/text()').extract()),
                'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
                'cate': response.xpath('//span[@class="a_catalog"]/a/text()').extract_first(),
            }
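    The same follow-then-parse pattern can also be written declaratively with Scrapy's CrawlSpider and link-extraction rules. The sketch below is only an illustration under stated assumptions: the spider name is hypothetical, and the allow pattern for article URLs is a guess, not taken from the article (note that CrawlSpider reserves parse, so the callback uses a different name):

    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor


    class QQNewsCrawlSpider(CrawlSpider):
        name = 'qqnews_crawl'  # hypothetical variant of the spider above
        allowed_domains = ['qq.com']
        start_urls = ['http://news.qq.com/society_index.shtml']

        # Follow every link matched by the extractor and hand the page to
        # parse_item; the allow pattern below is illustrative only.
        rules = (
            Rule(LinkExtractor(allow=r'a/\d{8}/'), callback='parse_item'),
        )

        def parse_item(self, response):
            yield {
                'title': response.xpath('//div[@class="qq_article"]/div/h1/text()').extract_first(),
                'time': response.xpath('//span[@class="a_time"]/text()').extract_first(),
            }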

    Summary

    This concludes the example code for the several crawling approaches of a Scrapy spider covered in this article; we hope it is helpful. Interested readers are welcome to browse other related topics on this site, and if anything is missing, please leave a comment pointing it out. Thank you for your support!
