python实现多线程采集的2个代码例子
代码一:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Multi-threaded page-title scraper, example 1: URLs are read from MySQL.

Ported from the original Python 2 text (urllib2/Queue, print statements,
and all inter-token whitespace had been stripped) to runnable Python 3.
"""
import queue
import re
import sys
import threading
import urllib.request

#
# Database connection settings
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"

#
# Scraper settings
#
THREAD_LIMIT = 3               # number of spider worker threads to start
jobs = queue.Queue(5)          # bounded queue of [id, url] jobs
singlelock = threading.Lock()  # serializes printing/fetching across threads
info = queue.Queue()           # unbounded result queue of [id, title] pairs
def workerbee(inputlist):
    """Start THREAD_LIMIT spider threads, feed them jobs, and wait.

    inputlist: iterable of [id, url] pairs enqueued on the global ``jobs``
    queue. Blocks until task_done() has been called for every queued job.
    """
    for x in range(THREAD_LIMIT):
        print('Thread {0} started.'.format(x))
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except queue.Full:
            # put() timed out: every worker is busy/dead and the queue is full
            singlelock.acquire()
            print("The queue is full!")
            singlelock.release()
    # Wait for the threads to finish
    singlelock.acquire()  # acquire the lock so we can print
    print("Waiting for threads to finish.")
    singlelock.release()  # release the lock
    jobs.join()  # blocks until all enqueued jobs are marked done
def getTitle(url, time=10):
    """Fetch *url* and return the text of its <title> element.

    time: socket timeout in seconds (default 10).
    Raises IndexError when the page has no <title> tag, and whatever
    urllib raises on network/HTTP errors.
    """
    response = urllib.request.urlopen(url, timeout=time)
    try:
        # Decode using the charset the server declares; fall back to utf-8.
        charset = response.headers.get_content_charset() or 'utf-8'
        html = response.read().decode(charset, 'replace')
    finally:
        # Close even when read()/decode raises (original leaked the socket).
        response.close()
    title = re.compile(r'<title>(.*?)</title>').findall(html)
    return title[0]
class spider(threading.Thread):
    """Worker thread: pull [id, url] jobs, fetch titles, push to ``info``.

    Exits when the job queue stays empty for one second. A failed fetch is
    logged and skipped; the original broke out of run() while still holding
    ``singlelock`` and without calling task_done(), which deadlocked the
    other workers and hung jobs.join() forever.
    """

    def run(self):
        while True:
            try:
                job = jobs.get(True, 1)  # wait up to 1s for more work
            except queue.Empty:
                break  # no work left: let this thread die
            try:
                with singlelock:
                    title = getTitle(job[1])
                    info.put([job[0], title], block=True, timeout=5)
            except Exception as exc:
                with singlelock:
                    print('Failed to fetch {0}: {1}'.format(job[1], exc))
            finally:
                jobs.task_done()  # always balance the get() so join() returns
if __name__ == '__main__':
    # Third-party driver (mysqlclient); imported here so the helpers above
    # remain importable without a database driver installed.
    import MySQLdb

    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cur = con.cursor()
        # NOTE(review): spaces were stripped in the original; this is the
        # evident intent of the query — confirm column/table names.
        cur.execute('SELECT id, url FROM `table_name` WHERE `status` = 0 LIMIT 10')
        rows = cur.fetchall()
        for row in rows:
            urls.append([row[0], row[1]])
        workerbee(urls)
        while not info.empty():
            print(info.get())
    finally:
        if con:
            con.close()
代码二:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Filename: robot.py
"""Multi-threaded page-title scraper, example 2: fixed list of sina.com URLs.

Ported from the original whitespace-stripped Python 2 text to Python 3.
"""
import queue
import re
import sys
import threading
import urllib.request

#
# Scraper settings
#
THREAD_LIMIT = 3               # number of spider worker threads
jobs = queue.Queue(5)          # bounded queue of URL jobs
singlelock = threading.Lock()  # serializes printing/fetching across threads

urls = [
    'http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml',
    'http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml',
    'http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml',
    'http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml',
    'http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml',
]
def workerbee(inputlist):
    """Start THREAD_LIMIT spider threads, feed them URLs, and wait.

    inputlist: iterable of URL strings enqueued on the global ``jobs``
    queue. Blocks until task_done() has been called for every queued job.
    """
    for x in range(THREAD_LIMIT):
        print('Thread {0} started.'.format(x))
        t = spider()
        t.start()
    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except queue.Full:
            # put() timed out: every worker is busy/dead and the queue is full
            singlelock.acquire()
            print("The queue is full!")
            singlelock.release()
    # Wait for the threads to finish
    singlelock.acquire()  # acquire the lock so we can print
    print("Waiting for threads to finish.")
    singlelock.release()  # release the lock
    jobs.join()  # blocks until all enqueued jobs are marked done
def getTitle(url, time=10):
    """Fetch *url* and return the text of its <title> element as str.

    time: socket timeout in seconds (default 10).
    The sina game pages this script targets are served as gb2312;
    undecodable bytes are replaced with U+FFFD. Raises IndexError when
    the page has no <title> tag.
    """
    response = urllib.request.urlopen(url, timeout=time)
    try:
        html = response.read().decode('gb2312', 'replace')
    finally:
        # Close even when read()/decode raises (original leaked the socket).
        response.close()
    title = re.compile(r'<title>(.*?)</title>').findall(html)
    return title[0]
class spider(threading.Thread):
    """Worker thread: pull URL jobs and print each page's title.

    Exits when the job queue stays empty for one second. A failed fetch is
    logged and skipped; the original broke out of run() while still holding
    ``singlelock`` and without calling task_done(), which deadlocked the
    other workers and hung jobs.join() forever.
    """

    def run(self):
        while True:
            try:
                job = jobs.get(True, 1)  # wait up to 1s for more work
            except queue.Empty:
                break  # no work left: let this thread die
            try:
                with singlelock:
                    title = getTitle(job)
                    print('This {0} is {1}'.format(job, title))
            except Exception as exc:
                with singlelock:
                    print('Failed to fetch {0}: {1}'.format(job, exc))
            finally:
                jobs.task_done()  # always balance the get() so join() returns
if __name__ == '__main__':
    # Crawl the hard-coded URL list; blocks until all jobs are processed.
    workerbee(urls)