python实现多线程网页下载器
本文为大家分享了python实现的一个多线程网页下载器,供大家参考,具体内容如下
这是一个有着真实需求的实现,我的用途是拿它来通过HTTP方式向服务器提交游戏数据。把它放上来也是想大家帮忙挑刺,找找bug,让它工作得更好。
keywords:python,http,multi-threads,thread,threading,httplib,urllib,urllib2,Queue,httppool,httppool
废话少说,上源码:
# -*- coding: utf-8 -*-
import httplib
import random
import thread
import time
import urllib
from Queue import Queue, Empty, Full
# HTTP method names used in task descriptions.
GET = 'GET'
POST = 'POST'

# Status value handed to the failure callback when the request itself raised
# instead of producing an HTTP status code.
UNEXPECTED_ERROR = -1

# Default request headers; per-task headers are merged on top of these.
# NOTE(review): the User-Agent value contains no spaces, which looks like an
# extraction artifact -- confirm the intended string before changing it.
HEADERS = {
    'Content-type': 'application/x-www-form-urlencoded',
    'Accept-Language': 'zh-cn',
    'User-Agent': 'Mozilla/4.0(compatible;MSIE6.0;WindowsNT5.0)',
    'Accept': 'text/plain',
}
def base_log(msg):
    """Default logger: print *msg* to standard output on its own line."""
    # A parenthesised single argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(msg)
def base_fail_op(task, status, log):
    """Default failure callback: report the failed task through *log*.

    task   -- the task object that failed (rendered with ``str``).
    status -- integer status; formatted with ``%d``.
    log    -- callable taking one message string.
    """
    message = 'failop.task=%s,status=%d' % (str(task), status)
    log(message)
def _http_exchange(conn_args, method, url, params, headers):
    """Perform one HTTP request and return ``(status, body)``.

    ``body`` is the response payload when the status is 200 and ``None``
    otherwise. The connection is always closed before returning or raising.
    """
    conn = httplib.HTTPConnection(**conn_args)
    try:
        if method == POST:
            conn.request(method, url, params, headers)
        else:
            # NOTE(review): for non-POST methods the encoded params are
            # appended directly to the URL (the URL must already end in '?')
            # and the prepared headers are not sent. Kept as in the original.
            conn.request(method, url + params)
        response = conn.getresponse()
        if response.status != httplib.OK:
            return response.status, None
        return response.status, response.read()
    finally:
        conn.close()


def get_remote_data(tasks, results, fail_op=base_fail_op, log=base_log):
    """Worker loop: take tasks from *tasks* forever, one HTTP request per task.

    Each task is a mapping with these keys:

    * ``'id'`` (required) -- integer identifier, returned with the result.
    * ``'conn_args'`` (required) -- keyword arguments for
      ``httplib.HTTPConnection`` (host, optionally timeout).
    * ``'params'`` -- mapping that is URL-encoded; defaults to empty.
    * ``'method'`` -- defaults to ``'GET'``.
    * ``'url'`` -- defaults to ``'/'``.
    * ``'headers'`` -- extra headers merged over a copy of ``HEADERS``.

    Outcomes:

    * status 200: ``(id, body)`` is put on *results* (blocking put).
    * any other status: ``fail_op(task, status, log)`` is called.
    * any exception while connecting, sending or reading: the error is
      logged and ``fail_op(task, UNEXPECTED_ERROR, log)`` is called.
    * a task missing ``'id'`` or ``'conn_args'``: the KeyError is logged and
      the task is dropped.

    This function never returns.
    """
    while True:
        task = tasks.get()
        try:
            tid = task['id']
            hpt = task['conn_args']  # hpt <= host:port, timeout
        except KeyError as e:
            log(str(e))
            continue
        log('thread_%sdoingtask%d' % (thread.get_ident(), tid))

        # ``or {}`` also tolerates an explicit None for params/headers.
        params = urllib.urlencode(task.get('params') or {})
        method = task.get('method', GET)
        url = task.get('url', '/')

        # Work on a copy: updating HEADERS itself would leak one task's
        # headers (and Content-Length) into every later task and thread.
        headers = dict(HEADERS)
        headers.update(task.get('headers') or {})
        headers['Content-Length'] = len(params)

        try:
            status, data = _http_exchange(hpt, method, url, params, headers)
        except Exception as e:
            # Boundary of the worker thread: report and keep the loop alive.
            log('requestfailed.method=%s,url=%s,params=%sheaders=%s' % (
                method, url, params, headers))
            log(str(e))
            fail_op(task, UNEXPECTED_ERROR, log)
            continue

        if status != httplib.OK:
            fail_op(task, status, log)
            continue
        results.put((tid, data), True)
class HttpPool(object):
    """A pool of worker threads that perform queued HTTP requests.

    Tasks are added with ``add_task`` and consumed by worker threads running
    ``get_remote_data``; finished results are collected with ``get_results``.
    The workers are started with the low-level ``thread`` module and are
    never joined or stopped by this class.
    """

    def __init__(self, threads_count, fail_op, log):
        """Create the task/result queues and start *threads_count* workers.

        fail_op -- callback ``fail_op(task, status, log)`` passed to each
                   worker for failed tasks.
        log     -- callable taking one message string, passed to each worker.
        """
        self._tasks = Queue()
        self._results = Queue()
        for _ in range(threads_count):
            thread.start_new_thread(
                get_remote_data,
                (self._tasks, self._results, fail_op, log))

    def add_task(self, tid, host, url, params, headers=None, method='GET',
                 timeout=None):
        """Queue one request; return True if queued, False if the queue is full.

        tid     -- task identifier returned alongside the response body.
        host    -- host passed to the HTTP connection.
        url     -- request path.
        params  -- mapping of request parameters.
        headers -- optional mapping of extra headers; a fresh empty dict is
                   used when omitted so tasks never share a default object.
        method  -- HTTP method name, 'GET' by default.
        timeout -- optional connection timeout; omitted from the connection
                   arguments when None.
        """
        conn_args = {'host': host}
        if timeout is not None:
            conn_args['timeout'] = timeout
        task = {
            'id': tid,
            'conn_args': conn_args,
            'headers': {} if headers is None else headers,
            'url': url,
            'params': params,
            'method': method,
        }
        try:
            self._tasks.put_nowait(task)
        except Full:
            return False
        return True

    def get_results(self):
        """Return all results currently available, without blocking.

        Each element is the ``(tid, data)`` tuple a worker placed on the
        result queue; the list is empty when nothing has finished yet.
        """
        results = []
        while True:
            try:
                res = self._results.get_nowait()
            except Empty:
                break
            results.append(res)
        return results
def test_google(task_count, threads_count):
    """Demo: send *task_count* search requests using *threads_count* workers.

    Queues the requests on an HttpPool, then polls for results and prints
    ``<task id> <body length>`` for each one. Returns once every queued task
    has either produced a result or been reported to the failure callback.
    """
    failed_ids = []

    def record_failure(task, status, log):
        # Keep the default failure logging, and remember the failure so the
        # polling loop below knows the task will never yield a result.
        base_fail_op(task, status, log)
        failed_ids.append(task.get('id'))

    hp = HttpPool(threads_count, record_failure, base_log)

    queued = 0
    for i in range(task_count):
        # Pass method='POST' as well to exercise the POST path instead.
        if hp.add_task(i, 'www.google.cn', '/search?', {'q': 'lai'}):
            queued += 1
            print('addtasksuccessed.')

    received = 0
    while received + len(failed_ids) < queued:
        results = hp.get_results()
        if not results:
            # Nothing ready yet: back off for a random fraction of a second.
            time.sleep(1.0 * random.random())
        for tid, data in results:
            print('%s %s' % (tid, len(data)))
        received += len(results)
if __name__ == '__main__':
    # random is used by test_google; importing it here binds it at module
    # level when the file is run as a script.
    import random
    import sys

    # Fail with a usage hint instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit('usage: python %s TASK_COUNT THREADS_COUNT' % sys.argv[0])
    task_count, threads_count = int(sys.argv[1]), int(sys.argv[2])
    test_google(task_count, threads_count)
有兴趣想尝试运行的朋友,可以把它保存为 xxxx.py,然后执行 python xxxx.py 10 4,其中 10 表示向 google.cn 请求 10 次查询,4 表示由 4 条线程来执行这些任务。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。