python实现多线程网页下载器
本文为大家分享了python实现的一个多线程网页下载器,供大家参考,具体内容如下
这是一个有着真实需求的实现,我的用途是拿它来通过HTTP方式向服务器提交游戏数据。把它放上来也是想大家帮忙挑刺,找找bug,让它工作得更好。
keywords:python,http,multi-threads,thread,threading,httplib,urllib,urllib2,Queue,httppool,httppool
废话少说,上源码:
# -*- coding: utf-8 -*-
"""A small thread-pool based HTTP client.

Tasks (plain dicts describing one HTTP request each) are pushed onto a task
queue, executed by worker threads running ``get_remote_data``, and the bodies
of successful (HTTP 200) responses are collected on a result queue as
``(task_id, data)`` tuples.  Failed tasks are reported through a ``fail_op``
callback instead of being queued.
"""
import random
import sys
import threading
import time

try:  # Python 2 module names, as used by the original script
    import httplib
    from Queue import Queue, Empty, Full
    from urllib import urlencode
except ImportError:  # the same modules under their Python 3 names
    import http.client as httplib
    from queue import Queue, Empty, Full
    from urllib.parse import urlencode

# ``test_google`` is a network demo, not library API; leaving it out of
# ``__all__`` also keeps ``from <module> import *`` from handing a
# ``test_``-prefixed function to test collectors.
__all__ = [
    'HEADERS', 'UNEXPECTED_ERROR', 'POST', 'GET',
    'base_log', 'base_fail_op', 'get_remote_data', 'HttpPool',
]

# Default request headers.  Treated as read-only: every request works on a copy.
HEADERS = {
    "Content-type": "application/x-www-form-urlencoded",
    'Accept-Language': 'zh-cn',
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)',
    "Accept": "text/plain",
}

# Status passed to ``fail_op`` when the request raised instead of returning
# an HTTP status.
UNEXPECTED_ERROR = -1
POST = 'POST'
GET = 'GET'

# Headers that describe a request body; they must not be sent without one.
_BODY_HEADERS = ('content-type', 'content-length')


def base_log(msg):
    """Default logger: print ``msg`` to standard output."""
    print(msg)


def base_fail_op(task, status, log):
    """Default failure handler: report the failed ``task`` through ``log``.

    ``status`` is the HTTP status code, or ``UNEXPECTED_ERROR`` when the
    request raised an exception.
    """
    log('fail op. task=%s, status=%d' % (str(task), status))


def _append_query(url, query):
    """Return ``url`` with the url-encoded ``query`` string attached."""
    if not query:
        return url
    if url.endswith(('?', '&')):
        # Historical calling convention: the caller already supplied the
        # separator (e.g. '/search?').
        return url + query
    separator = '&' if '?' in url else '?'
    return url + separator + query


def _build_request(task):
    """Translate ``task`` into ``(method, url, body, headers)``.

    Missing 'params', 'method', 'url' and 'headers' entries default to
    ``{}``, ``'GET'``, ``'/'`` and ``{}`` respectively.
    """
    query = urlencode(task.get('params') or {})
    method = task.get('method') or GET
    url = task.get('url') or '/'

    # Work on a copy so per-task headers never leak into the shared HEADERS
    # dict (and from there into other tasks and threads).
    headers = dict(HEADERS)
    headers.update(task.get('headers') or {})

    if method == POST:
        headers['Content-Length'] = str(len(query))
        return method, url, query, headers

    # Anything but POST is sent without a body: the parameters travel in the
    # URL and body-describing headers are dropped, since announcing a
    # Content-Length with no body would leave the server waiting for data.
    for name in list(headers):
        if name.lower() in _BODY_HEADERS:
            del headers[name]
    return method, _append_query(url, query), None, headers


def _perform_request(conn_args, method, url, body, headers):
    """Run one request and return ``(status, data)``.

    ``data`` is the response body for HTTP 200 and ``None`` otherwise.  The
    connection is always closed, whether or not the request succeeds.
    """
    conn = httplib.HTTPConnection(**conn_args)
    try:
        conn.request(method, url, body, headers)
        response = conn.getresponse()
        if response.status != httplib.OK:
            return response.status, None
        return response.status, response.read()
    finally:
        conn.close()


def get_remote_data(tasks, results, fail_op=base_fail_op, log=base_log):
    """Worker loop: execute HTTP tasks from ``tasks`` until told to stop.

    Args:
        tasks: queue of task dicts.  Each task needs 'id' and 'conn_args'
            (keyword arguments for ``HTTPConnection``: host[:port], timeout);
            'params', 'method', 'url' and 'headers' are optional.  A ``None``
            item makes the worker return.
        results: queue receiving ``(task_id, data)`` for every HTTP 200
            response.
        fail_op: ``fail_op(task, status, log)``, called with the HTTP status
            of a non-200 response, or ``UNEXPECTED_ERROR`` if the request
            raised.
        log: ``log(msg)`` callable used for progress and error messages.

    Tasks lacking 'id' or 'conn_args' are logged and skipped without calling
    ``fail_op``.
    """
    while True:
        task = tasks.get()
        if task is None:  # stop sentinel
            return
        try:
            tid = task['id']
            conn_args = task['conn_args']  # host[:port], optional timeout
        except KeyError as e:
            log(str(e))
            continue
        # %s rather than %d: task ids are not required to be integers.
        log('thread_%s doing task %s'
            % (threading.current_thread().ident, tid))

        method, url, body, headers = _build_request(task)
        try:
            status, data = _perform_request(
                conn_args, method, url, body, headers)
        except Exception as e:
            # Boundary of the worker: one bad task (unreachable host, invalid
            # port, read error, ...) must not kill the whole thread.
            log('request failed. method=%s, url=%s, params=%s headers=%s' % (
                method, url, body, headers))
            log(str(e))
            fail_op(task, UNEXPECTED_ERROR, log)
            continue
        if status != httplib.OK:
            fail_op(task, status, log)
            continue
        results.put((tid, data), True)


class HttpPool(object):
    """A fixed-size pool of worker threads performing HTTP requests."""

    def __init__(self, threads_count, fail_op, log):
        """Start ``threads_count`` workers running ``get_remote_data``.

        ``fail_op`` and ``log`` are handed to every worker unchanged.
        """
        self._tasks = Queue()
        self._results = Queue()
        self._threads_count = threads_count
        for _ in range(threads_count):
            worker = threading.Thread(
                target=get_remote_data,
                args=(self._tasks, self._results, fail_op, log))
            # Daemon threads do not keep the interpreter alive, matching the
            # behaviour of the low-level thread API used originally.
            worker.daemon = True
            worker.start()

    def add_task(self, tid, host, url, params, headers=None, method='GET',
                 timeout=None):
        """Queue one request; return True if queued, False if the queue is full.

        Args:
            tid: identifier returned alongside the response data.
            host: server host, optionally as 'host:port'.
            url: request path.
            params: dict of parameters, url-encoded into the body for POST and
                into the URL for every other method.
            headers: extra headers merged over ``HEADERS`` for this task only.
            method: HTTP method name.
            timeout: connection timeout in seconds, or None for the default.
        """
        conn_args = {'host': host}
        if timeout is not None:
            conn_args['timeout'] = timeout
        task = {
            'id': tid,
            'conn_args': conn_args,
            'headers': {} if headers is None else headers,
            'url': url,
            'params': params,
            'method': method,
        }
        try:
            self._tasks.put_nowait(task)
        except Full:
            return False
        return True

    def get_results(self):
        """Return all ``(tid, data)`` results available now, without blocking."""
        results = []
        while True:
            try:
                results.append(self._results.get_nowait())
            except Empty:
                return results

    def close(self):
        """Ask every worker to exit once the tasks queued so far are done."""
        for _ in range(self._threads_count):
            self._tasks.put(None)


def test_google(task_count, threads_count):
    """Demo: send ``task_count`` searches to google.cn using ``threads_count`` threads.

    Prints the id and body length of every successful response and returns
    once each queued task has either produced a result or been reported as
    failed.
    """
    failed_ids = Queue()

    def record_failure(task, status, log):
        # Failures never reach the result queue, so count them separately to
        # know when all tasks are finished.
        base_fail_op(task, status, log)
        failed_ids.put(task['id'])

    hp = HttpPool(threads_count, record_failure, base_log)
    pending = 0
    for i in range(task_count):
        if hp.add_task(i,
                       'www.google.cn',
                       '/search?',
                       {'q': 'lai'},
                       # method='POST'
                       ):
            print('add task successed.')
            pending += 1

    while pending > 0:
        results = hp.get_results()
        for tid, data in results:
            print('%s %s' % (tid, len(data)))
            # print(unicode(data, 'gb18030'))
        pending -= len(results)
        while True:
            try:
                failed_ids.get_nowait()
            except Empty:
                break
            pending -= 1
        if pending > 0 and not results:
            time.sleep(1.0 * random.random())
    hp.close()


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('usage: %s TASK_COUNT THREADS_COUNT' % sys.argv[0])
        sys.exit(1)
    task_count, threads_count = int(sys.argv[1]), int(sys.argv[2])
    test_google(task_count, threads_count)
有兴趣想尝试运行的朋友,可以把它保存为 xxxx.py,然后执行 python xxxx.py 10 4,其中 10 表示向 google.cn 请求 10 次查询,4 表示由 4 条线程来执行这些任务。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。