Flask中集成Scrapy
如果只是在Flask中调用Scrapy爬虫,可能会遇到如下错误:
ValueError: signal only works in main thread  # 或者 twisted.internet.error.ReactorNotRestartable
解决的办法有几个。
1. 使用 Python 子进程(subprocess)
首先,确保目录结构类似如下:
> tree -L 1
├── dirbot
├── README.rst
├── scrapy.cfg
├── server.py
└── setup.py
然后,在新进程中启动爬虫:
# server.py
import subprocess

from flask import Flask

app = Flask(__name__)


@app.route('/')
def hello_world():
    """
    Run spider in another process and store items in a file.

    Simply issue the command:
    > scrapy crawl dmoz -o "output.json"
    wait for this command to finish, and read output.json back to the client.

    :return: the raw contents of output.json as the HTTP response body.
    """
    spider_name = "dmoz"
    # Block until the crawl finishes; raises CalledProcessError if scrapy
    # exits with a non-zero status.
    # NOTE(review): scrapy's -o flag appends to an existing file, so stale
    # results may accumulate across requests — consider deleting the file
    # first or writing to a per-request temp file.
    subprocess.check_output(['scrapy', 'crawl', spider_name, "-o", "output.json"])
    with open("output.json") as items_file:
        return items_file.read()


if __name__ == '__main__':
    app.run(debug=True)
以上就是在新进程中启动爬虫的方法。
2. 使用 Twisted-Klein + Scrapy
代码如下:
# server.py
import json

from klein import route, run
from scrapy import signals
from scrapy.crawler import CrawlerRunner

from dirbot.spiders.dmoz import DmozSpider


class MyCrawlerRunner(CrawlerRunner):
    """
    Crawler object that collects items and returns output after finishing crawl.
    """

    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        """Start a crawl and return a Deferred that fires with the scraped items.

        :param crawler_or_spidercls: a Crawler instance, Spider subclass, or
            spider name accepted by ``create_crawler``.
        :return: twisted Deferred whose callback value is the list of items.
        """
        # keep all items scraped
        self.items = []
        # create crawler (same as in base CrawlerProcess)
        crawler = self.create_crawler(crawler_or_spidercls)
        # handle each item scraped
        crawler.signals.connect(self.item_scraped, signals.item_scraped)
        # create Twisted Deferred launching the crawl
        dfd = self._crawl(crawler, *args, **kwargs)
        # add callback - when crawl is done, call return_items
        dfd.addCallback(self.return_items)
        return dfd

    def item_scraped(self, item, response, spider):
        """Signal handler: accumulate each scraped item."""
        self.items.append(item)

    def return_items(self, result):
        """Deferred callback: replace the crawl result with the item list."""
        return self.items
def return_spider_output(output):
    """
    Serialize scraped items into a JSON string.

    :param output: items scraped by CrawlerRunner (dict-convertible objects)
    :return: JSON string with the list of items
    """
    # this just turns items into dictionaries
    # you may want to use Scrapy's JSON serializer here
    return json.dumps([dict(item) for item in output])
@route("/")
defschedule(request):
runner=MyCrawlerRunner()
spider=DmozSpider()
deferred=runner.crawl(spider)
deferred.addCallback(return_spider_output)
returndeferred
run("localhost",8080)
3. 使用 ScrapyRT
安装ScrapyRT,然后启动:
> scrapyrt
文章来源:https://stackoverflow.com/questions/36384286/how-to-integrate-flask-scrapy