php与python实现的线程池多线程爬虫功能示例
本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:
多线程爬虫可以并发抓取内容,从而提升抓取性能。这里我们来看php与python线程池多线程爬虫的例子,代码如下:
php例子
<?php

/**
 * Worker (pthreads "worker" pattern) that owns one cURL handle which is
 * reused by every task executed on that worker.
 */
class Connect extends Worker
{
    /**
     * Note that the link is stored statically, which for pthreads means
     * thread local.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Return the cURL handle, creating and configuring it on first use.
     *
     * @return resource|CurlHandle the shared cURL handle
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);
        }

        /* do some exception/error stuff here maybe */

        return self::$ch;
    }

    /**
     * Close the cURL handle if one was ever opened.
     */
    public function closeConnection()
    {
        // Guard: closing a handle that was never created is an error.
        if (self::$ch) {
            curl_close(self::$ch);
            self::$ch = null;
        }
    }
}

/**
 * One unit of work: fetch a single URL through the worker's cURL handle
 * and print whether the request succeeded.
 */
class Query extends Threaded
{
    protected $url;
    protected $result;

    /**
     * @param string $url the URL to fetch
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the URL, report the outcome and keep the page body as the result.
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Print "OK" for an HTTP 200 response, otherwise the cURL error text.
     *
     * @param string       $url   the URL that was fetched
     * @param string|false $page  the response body (unused here)
     * @param array        $info  the result of curl_getinfo()
     * @param string       $error the result of curl_error()
     */
    function deal_data($url, $page, $info, $error)
    {
        // The label is the second dot-separated piece of the URL; fall back
        // to the whole URL when it contains no dot.
        $parts = explode(".", $url);
        $id = isset($parts[1]) ? $parts[1] : $url;
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    /**
     * Print one tab-separated "<id>\t<msg>" line.
     */
    function show_msg($id, $msg)
    {
        echo $id . "\t$msg\n";
    }

    /**
     * @return string|false the body stored by run()
     */
    public function getResult()
    {
        return $this->result;
    }
}

/**
 * Submit one Query per URL to a pool of Connect workers and wait for
 * all of them to finish.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // the URLs to crawl (url => display name)
    $check_urls = array('http://xxx.com' => "xx网",);
    $pool = new Pool(10, "Connect", array()); // pool of 10 worker threads
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();

python多线程

from threading import Thread


def handle(sid):
    """Process one item; the crawling / data handling goes here."""
    pass


class MyThread(Thread):
    """Thread that calls handle() with the id it was created with."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        handle(self.sid)


threads = []
for i in range(1, 11):
    t = MyThread(i)
    threads.append(t)
    t.start()

# Wait for every thread to finish.
for t in threads:
    t.join()
python线程池爬虫:
"""Thread-pool crawler that follows links on a site served at localhost:3000."""
from queue import Queue
from threading import Thread, Lock
import urllib.parse
import socket
import re
import time

SERVER_HOST = 'localhost'
SERVER_PORT = 3000

# Paths already queued or fetched; shared by all workers and guarded by `lock`.
seen_urls = {'/'}
lock = Lock()

_HREF_RE = re.compile(r'''(?i)href=["']?([^\s"'<>]+)''')
_HEADER_END = b'\r\n\r\n'


class Fetcher(Thread):
    """Daemon worker that fetches paths from a task queue and enqueues new links.

    The thread starts itself as soon as it is constructed.
    """

    def __init__(self, tasks):
        """Store the shared task queue and start the worker.

        Args:
            tasks: a ``queue.Queue`` of URL paths to fetch.
        """
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        """Fetch queued paths forever, queueing every link not seen before."""
        while True:
            url = self.tasks.get()
            try:
                print(url)
                response = self._fetch(url)
                links = self.parse_links(url, response)
                with lock:
                    for link in links.difference(seen_urls):
                        self.tasks.put(link)
                    seen_urls.update(links)
            except Exception as exc:
                # Worker-loop boundary: report the failure and keep the
                # thread alive so the remaining tasks still get processed.
                print('error: {}: {}'.format(url, exc))
            finally:
                # Always mark the task done, otherwise Queue.join() never
                # returns after a failed fetch.
                self.tasks.task_done()

    @staticmethod
    def _build_request(url):
        """Return the raw HTTP/1.0 GET request bytes for *url*."""
        request = 'GET {} HTTP/1.0\r\nHost: {}\r\n\r\n'.format(url, SERVER_HOST)
        return request.encode('ascii')

    def _fetch(self, url):
        """Request *url* from the server and return the full raw response."""
        chunks = []
        with socket.socket() as sock:
            sock.connect((SERVER_HOST, SERVER_PORT))
            # sendall: a plain send() may transmit only part of the request.
            sock.sendall(self._build_request(url))
            while True:
                chunk = sock.recv(4096)
                if not chunk:
                    break
                chunks.append(chunk)
        return b''.join(chunks)

    def parse_links(self, fetched_url, response):
        """Return the set of same-site paths linked from an HTML response.

        Args:
            fetched_url: the path that produced *response*; relative links
                are resolved against it.
            response: the raw HTTP response bytes (status line, headers, body).

        Returns:
            A set of path strings. It is empty when the response is empty,
            is not ``text/html``, or contains no usable links.
        """
        if not response:
            print('error: {}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()

        links = set()
        for href in set(_HREF_RE.findall(self.body(response))):
            try:
                normalized = urllib.parse.urljoin(fetched_url, href)
                parts = urllib.parse.urlparse(normalized)
            except ValueError:
                # Malformed URL (e.g. a broken IPv6 literal): skip it.
                continue
            if parts.scheme not in ('', 'http', 'https'):
                continue
            host = parts.hostname  # already lower-cased, port stripped
            if host and host != SERVER_HOST:
                continue
            # Only the path is kept, so fragments and queries are dropped;
            # an empty path means the site root.
            links.add(parts.path or '/')
        return links

    def body(self, response):
        """Return the decoded body of a raw HTTP response.

        Returns an empty string when the header terminator is missing;
        undecodable bytes are replaced rather than raising.
        """
        payload = response.partition(_HEADER_END)[2]
        return payload.decode('utf-8', errors='replace')

    def _is_html(self, response):
        """Return True when the response's Content-Type is ``text/html``."""
        head = response.partition(_HEADER_END)[0]
        headers = {}
        # Skip the status line; split each header on the first colon only,
        # because values such as dates contain colons themselves.
        for line in head.decode('latin-1').split('\r\n')[1:]:
            name, sep, value = line.partition(':')
            if sep:
                headers[name.strip().lower()] = value.strip()
        return headers.get('content-type', '').lower().startswith('text/html')


class ThreadPool:
    """Fixed-size pool of Fetcher threads sharing one task queue."""

    def __init__(self, num_threads):
        """Create the queue and start *num_threads* workers."""
        self.tasks = Queue()
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Queue *url* for fetching."""
        self.tasks.put(url)

    def wait_completion(self):
        """Block until every queued task has been marked done."""
        self.tasks.join()


if __name__ == '__main__':
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()
    print('{} URLs fetched in {:.1f} seconds'.format(len(seen_urls),
                                                     time.time() - start))
对更多PHP相关内容感兴趣的读者可查看本站专题:《php curl用法总结》、《PHP数组(Array)操作技巧大全》、《php排序算法总结》、《PHP常用遍历算法与技巧总结》、《PHP数据结构与算法教程》、《php程序设计算法总结》、《PHP数学运算技巧总结》、《php正则表达式用法总结》、《PHP运算与运算符用法总结》、《php字符串(string)用法总结》及《php常见数据库操作技巧汇总》。
希望本文所述对大家PHP程序设计有所帮助。