Python Big Data: A Detailed Guide to Scraping Data from Web Pages
This article walks through a working example of scraping data from a web page with Python and Scrapy. It is shared for your reference; the details are as follows:
myspider.py :
#!/usr/bin/python
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from lxml import etree
from jredu.items import JreduItem

class JreduSpider(Spider):
    name = 'tt'  # the spider's name: required, and must be unique
    allowed_domains = ['sohu.com']
    start_urls = [
        'http://www.sohu.com'
    ]

    def parse(self, response):
        content = response.body.decode('utf-8')
        dom = etree.HTML(content)
        for ul in dom.xpath("//div[@class='focus-news-box']/div[@class='list16']/ul"):
            lis = ul.xpath("./li")
            for li in lis:
                item = JreduItem()  # create an item object
                if ul.index(li) == 0:
                    # the first <li> wraps its headline in a <strong> tag
                    strong = li.xpath("./a/strong/text()")
                    item['title'] = strong[0]
                    item['href'] = li.xpath("./a/@href")[0]
                else:
                    # the other <li> elements may hold several <a> tags; take the last one
                    la = li.xpath("./a[last()]/text()")
                    item['title'] = la[0]
                    item['href'] = li.xpath("./a[last()]/@href")[0]
                yield item
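As a side note, Scrapy responses support XPath directly, so the detour through lxml is optional. Below is a minimal sketch of the same extraction using response.xpath; it uses the same class and fields as above, but note that the XPath expressions assume the sohu.com markup at the time of writing and may no longer match the live page:
    def parse(self, response):
        # response.xpath works on the response body directly; no manual decode needed
        for li in response.xpath("//div[@class='focus-news-box']/div[@class='list16']/ul/li"):
            item = JreduItem()
            # take the last <a> in each list item, as in the lxml version
            item['title'] = li.xpath("./a[last()]//text()").extract_first()
            item['href'] = li.xpath("./a[last()]/@href").extract_first()
            if item['title'] and item['href']:
                yield item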
items.py :
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class JreduItem(scrapy.Item):  # comparable to an entity class in Java
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # create a Field object for each attribute
    href = scrapy.Field()
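A scrapy.Item behaves like a dictionary: only the declared fields may be assigned, and an item converts cleanly to a plain dict, which is exactly what the pipeline below relies on. For example, with hypothetical values:
item = JreduItem()
item['title'] = u'Example headline'  # assigning an undeclared field would raise KeyError
item['href'] = 'http://www.sohu.com/some-article'
print(dict(item))  # prints the item as a plain dict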
middlewares.py :
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

class JreduSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
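This middleware is the stock template generated by scrapy startproject and does nothing on its own; Scrapy only runs it if it is registered. To enable it, uncomment the corresponding entry in settings.py (it appears commented out further below):
SPIDER_MIDDLEWARES = {
    'jredu.middlewares.JreduSpiderMiddleware': 543,
}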
pipelines.py :
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import json

class JreduPipeline(object):
    def __init__(self):
        # open the output file once, when the pipeline is created
        self.fill = codecs.open("data.txt", encoding="utf-8", mode="wb")

    def process_item(self, item, spider):
        # write each item as one JSON object per line
        line = json.dumps(dict(item)) + "\n"
        self.fill.write(line)
        return item

    def close_spider(self, spider):
        # close the file when the spider finishes so buffered lines are flushed
        self.fill.close()
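One caveat: json.dumps escapes non-ASCII characters by default, so Chinese titles land in data.txt as \uXXXX sequences. If you want human-readable output, a one-line tweak to process_item (not part of the original code) does it:
line = json.dumps(dict(item), ensure_ascii=False) + "\n"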
settings.py :
# -*- coding: utf-8 -*-

# Scrapy settings for jredu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jredu'

SPIDER_MODULES = ['jredu.spiders']
NEWSPIDER_MODULE = 'jredu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jredu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#  'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jredu.middlewares.JreduSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jredu.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jredu.pipelines.JreduPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
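For anything beyond a quick demo, you would normally identify your crawler and throttle it rather than hit the site at full speed. That only requires uncommenting and filling in a few of the settings above; the values here are illustrative:
USER_AGENT = 'jredu (+http://www.yourdomain.com)'
DOWNLOAD_DELAY = 3
CONCURRENT_REQUESTS_PER_DOMAIN = 16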
Finally, we need an entry point to launch the crawl:
main.py :
#!/usr/bin/python
# -*- coding: utf-8 -*-

# entry point for running the spider
from scrapy import cmdline

cmdline.execute("scrapy crawl tt".split())
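Run the crawl either through this entry script or directly with the Scrapy CLI; both commands assume you are in the project root:
python main.py
scrapy crawl tt
Either way, the scraped headlines and links are written to data.txt, one JSON object per line.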
We hope this article helps readers with their Python programming.