php与python实现的线程池多线程爬虫功能示例
本文实例讲述了php与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:
多线程爬虫可以并发抓取内容,从而提升抓取性能。下面我们来看php与python线程池多线程爬虫的例子,代码如下:
php例子
<?php
/**
 * pthreads worker that lazily creates one cURL handle and hands it to the
 * tasks it executes, so successive requests reuse the same handle.
 */
class Connect extends Worker // worker pattern
{
    /**
     * Note that the link is stored statically, which for pthreads means
     * thread local.
     */
    protected static $ch;

    public function __construct()
    {
    }

    /**
     * Return the cURL handle, creating and configuring it on first use.
     *
     * @return resource the shared cURL handle
     */
    public function getConnection()
    {
        if (!self::$ch) {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, 2);         // whole transfer limited to 2 seconds
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, 1);  // curl_exec() returns the body instead of printing it
            curl_setopt(self::$ch, CURLOPT_HEADER, 0);          // keep response headers out of the returned body
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, "Firefox");
            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, 1);  // follow HTTP redirects
        }
        /* do some exception/error stuff here maybe */
        return self::$ch;
    }

    /**
     * Close the cURL handle if one was created.
     *
     * The cached handle is cleared afterwards so a later getConnection()
     * builds a fresh one instead of returning a closed handle.
     */
    public function closeConnection()
    {
        if (self::$ch) {
            curl_close(self::$ch);
            self::$ch = null;
        }
    }
}
/**
 * Pool task that fetches one URL through the worker's cURL handle, prints a
 * one-line status and keeps the response body.
 */
class Query extends Threaded
{
    /** URL this task fetches. */
    protected $url;

    /** Value returned by curl_exec() for the URL (false when the request failed). */
    protected $result;

    /**
     * @param string $url URL to fetch
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the URL with the handle supplied by the executing worker,
     * report the outcome and store the page.
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Print "OK" for an HTTP 200 response, otherwise the failure reason.
     *
     * @param string       $url   fetched URL
     * @param string|false $page  response body (unused here)
     * @param array        $info  curl_getinfo() result; 'http_code' is read
     * @param string       $error curl_error() text, empty when cURL itself succeeded
     */
    public function deal_data($url, $page, $info, $error)
    {
        // The label is the second dot-separated piece of the URL; fall back
        // to the whole URL when it contains no dot.
        $parts = explode(".", $url);
        $id = isset($parts[1]) ? $parts[1] : $url;
        if ($info['http_code'] != 200) {
            // A non-200 response without a transport error has an empty cURL
            // message, so report the status code instead of a blank line.
            $msg = ($error !== '') ? $error : 'HTTP ' . $info['http_code'];
            $this->show_msg($id, $msg);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    /**
     * Echo one tab-separated status line.
     */
    public function show_msg($id, $msg)
    {
        echo $id . "\t$msg\n";
    }

    /**
     * @return string|false the value stored by run()
     */
    public function getResult()
    {
        return $this->result;
    }
}
/**
 * Submit one Query task per configured URL to a pool of Connect workers,
 * then shut the pool down.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // URLs to crawl, as url => site name
    $check_urls = array('http://xxx.com' => "xx网",);
    $pool = new Pool(10, "Connect", array()); // pool of up to 10 Connect workers
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();
python多线程
def handle(sid):
    """Process the crawl job identified by *sid*.

    Placeholder: the crawler's data handling is meant to be implemented here.
    """
    pass
from threading import Thread


class MyThread(Thread):
    """Thread that calls handle() with the id it was created with."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        # Called on the new thread by start(); it must accept self to read sid.
        handle(self.sid)
# Start ten MyThread workers (ids 1 through 10), then wait for all of them.
threads = []
for i in range(1, 11):  # range works on Python 2 and 3; xrange is Python 2 only
    t = MyThread(i)
    threads.append(t)
    t.start()
for t in threads:
    t.join()
python线程池爬虫:
fromqueueimportQueue
fromthreadingimportThread,Lock
importurllib.parse
importsocket
importre
importtime
# Paths discovered so far; seeded with the root path.
seen_urls = {'/'}
# Lock that the fetcher threads hold while reading and updating seen_urls.
lock = Lock()
class Fetcher(Thread):
    """Daemon worker thread that crawls paths taken from a shared queue.

    Each path is requested from localhost:3000 over a plain socket. Links
    found in HTML responses that were not seen before are put back on the
    queue.
    """

    def __init__(self, tasks):
        """Store the task queue and start the thread immediately.

        Args:
            tasks: queue of URL paths; must provide get(), put() and
                task_done().
        """
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        """Fetch queued paths forever, queueing newly discovered links."""
        while True:
            url = self.tasks.get()
            try:
                print(url)
                response = self._fetch(url)
                links = self.parse_links(url, response)
                with lock:
                    for link in links.difference(seen_urls):
                        self.tasks.put(link)
                    seen_urls.update(links)
            finally:
                # Always balance get(); otherwise a failure while handling
                # one URL would leave Queue.join() waiting forever.
                self.tasks.task_done()

    def _fetch(self, url):
        """Return the raw HTTP/1.0 response for *url*, or b'' on failure.

        Connection errors and paths that cannot be encoded as ASCII produce
        an empty response, which parse_links() reports as an error.
        """
        request = 'GET {} HTTP/1.0\r\nHost: localhost\r\n\r\n'.format(url)
        chunks = []
        try:
            # The with-block closes the socket even when the transfer fails.
            with socket.socket() as sock:
                sock.connect(('localhost', 3000))
                sock.sendall(request.encode('ascii'))
                chunk = sock.recv(4096)
                while chunk:
                    chunks.append(chunk)
                    chunk = sock.recv(4096)
        except (OSError, UnicodeError):
            return b''
        return b''.join(chunks)

    def parse_links(self, fetched_url, response):
        """Return the set of same-site paths linked from an HTML response.

        Args:
            fetched_url: path the response was fetched from; relative links
                are resolved against it.
            response: raw response bytes (status line, headers and body).

        Returns:
            A set of path strings. Empty when the response is empty (an
            error line is printed) or is not HTML.
        """
        if not response:
            print('error:{}'.format(fetched_url))
            return set()
        if not self._is_html(response):
            return set()
        hrefs = set(re.findall(r'''(?i)href=["']?([^\s"'<>]+)''',
                               self.body(response)))
        links = set()
        for href in hrefs:
            try:
                normalized = urllib.parse.urljoin(fetched_url, href)
                parts = urllib.parse.urlparse(normalized)
                host = parts.hostname  # already lower-cased, port removed
            except ValueError:
                # Malformed link (e.g. a broken IPv6 literal): skip it.
                continue
            if parts.scheme not in ('', 'http', 'https'):
                continue
            # Exact comparison: a substring test would also accept hosts
            # such as "local" or "host".
            if host and host != 'localhost':
                continue
            # The parsed path never contains the fragment; an empty path
            # refers to the site root.
            links.add(parts.path or '/')
        return links

    def body(self, response):
        """Return the decoded text after the header block ('' if none)."""
        _, _, payload = response.partition(b'\r\n\r\n')
        return payload.decode('utf-8', errors='replace')

    def _is_html(self, response):
        """Return True when the Content-Type header starts with text/html."""
        head, separator, _ = response.partition(b'\r\n\r\n')
        if not separator:
            return False
        headers = {}
        # Skip the status line. Split each header on the first colon only,
        # because values such as dates contain colons themselves.
        for line in head.decode('latin-1').split('\r\n')[1:]:
            name, colon, value = line.partition(':')
            if colon:
                headers[name.strip().lower()] = value.strip()
        return headers.get('content-type', '').lower().startswith('text/html')
class ThreadPool:
    """Fixed-size pool of Fetcher threads fed from one shared queue."""

    def __init__(self, num_threads):
        """Create the task queue and start *num_threads* Fetcher workers."""
        self.tasks = Queue()
        for _ in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Queue *url* for fetching."""
        self.tasks.put(url)

    def wait_completion(self):
        """Block until every queued task has been marked done."""
        self.tasks.join()
if __name__ == '__main__':
    # Crawl from the root path with four fetcher threads and report timing.
    start = time.time()
    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()
    print('{} URLs fetched in {:.1f} seconds'.format(
        len(seen_urls), time.time() - start))
对更多PHP相关内容感兴趣的读者可查看本站专题:《phpcurl用法总结》、《PHP数组(Array)操作技巧大全》、《php排序算法总结》、《PHP常用遍历算法与技巧总结》、《PHP数据结构与算法教程》、《php程序设计算法总结》、《PHP数学运算技巧总结》、《php正则表达式用法总结》、《PHP运算与运算符用法总结》、《php字符串(string)用法总结》及《php常见数据库操作技巧汇总》
希望本文所述对大家PHP程序设计有所帮助。