尝试使用Python多线程抓取代理服务器IP地址的示例
这里以抓取http://www.proxy.com.ru站点的代理服务器为例,代码如下:
#!/usr/bin/env python
#coding:utf-8
import re
import threading
import time
import urllib2

import MySQLdb
# Shared result lists, appended to by the worker threads defined below.
# rawProxyList holds [ip, port, addr] entries scraped from the listing pages;
# checkedProxyList holds (ip, port, addr, seconds) tuples for proxies that
# passed validation.
rawProxyList = []
checkedProxyList = []

# Listing pages to scrape: list_1.html through list_41.html.
targets = [r"http://www.proxy.com.ru/list_%d.html" % i for i in range(1, 42)]

# Regex for one proxy table row. It captures five cells; the scraper uses
# group 2 (ip), group 3 (port) and group 5 (address text).
p = re.compile(r'''<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>''')
# Thread that scrapes proxies from one listing page.
class ProxyGet(threading.Thread):
    """Fetch one listing page and append every proxy found to rawProxyList.

    Args:
        target: URL of the listing page to download.
    """

    def __init__(self, target):
        super(ProxyGet, self).__init__()
        self.target = target

    def getProxy(self):
        """Download self.target, parse it with the module regex ``p`` and
        append one ``[ip, port, addr]`` list per matched row to rawProxyList.

        Network and decoding errors are not caught here; they propagate and
        end this thread only.
        """
        print("代理服务器目标网站:" + self.target)
        req = urllib2.urlopen(self.target)
        try:
            result = req.read()
        finally:
            # Release the connection even if the read fails.
            req.close()
        for row in p.findall(result):
            ip = row[1]
            port = row[2]
            # The address cell is re-encoded from cp936 to UTF-8.
            addr = row[4].decode("cp936").encode("utf-8")
            proxy = [ip, port, addr]
            print(proxy)
            rawProxyList.append(proxy)

    def run(self):
        """Thread entry point: scrape the configured page."""
        self.getProxy()
# Thread that validates a slice of the scraped proxies.
class ProxyCheck(threading.Thread):
    """Test each proxy in ``proxyList`` by fetching a known page through it.

    Args:
        proxyList: sequence of ``[ip, port, addr]`` entries to validate.
    """

    def __init__(self, proxyList):
        super(ProxyCheck, self).__init__()
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        # Marker that must appear in the fetched page for the proxy to pass.
        self.testStr = "030173"

    def checkProxy(self):
        """Fetch self.testUrl through every proxy and record the working ones.

        A proxy passes when the request completes within self.timeout seconds
        and the response body contains self.testStr. Passing proxies are
        appended to checkedProxyList as ``(ip, port, addr, seconds)``.
        """
        cookies = urllib2.HTTPCookieProcessor()
        headers = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = headers
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                try:
                    result = req.read()
                    timeused = time.time() - t1
                finally:
                    # Always release the connection opened through the proxy.
                    req.close()
            except Exception:
                # Best effort: any failure just means this proxy is unusable,
                # so move on to the next one.
                continue
            # Membership test: the marker counts wherever it occurs,
            # including at offset 0 or 1.
            if self.testStr in result:
                checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))

    def run(self):
        """Thread entry point: validate the assigned proxies."""
        self.checkProxy()
if __name__ == "__main__":
    # Script flow: scrape all listing pages in parallel, validate the scraped
    # proxies with 20 threads, then persist the fast ones to a text file and
    # to MySQL.

    # One scraping thread per target listing page.
    getThreads = [ProxyGet(target) for target in targets]
    for t in getThreads:
        t.start()
    for t in getThreads:
        t.join()
    print('.' * 10 + "总共抓取了%s个代理" % len(rawProxyList) + '.' * 10)

    # Split the scraped proxies into 20 slices, one validation thread each.
    # Rounding the slice size up guarantees the 20 slices cover every entry;
    # `//` keeps this an integer on any Python version.
    chunkSize = (len(rawProxyList) + 19) // 20
    checkThreads = [
        ProxyCheck(rawProxyList[chunkSize * i:chunkSize * (i + 1)])
        for i in range(20)
    ]
    for t in checkThreads:
        t.start()
    for t in checkThreads:
        t.join()
    print('.' * 10 + "总共有%s个代理通过校验" % len(checkedProxyList) + '.' * 10)

    # Insert into the database. The table must be created by the user, with
    # the four columns ip, port, speed, address.
    def db_insert(insert_list):
        """Replace the contents of the ``proxy`` table with ``insert_list``.

        Args:
            insert_list: sequence of ``(ip, port, speed, address)`` tuples.

        MySQLdb errors are printed, not raised. The connection is closed
        whether or not the statements succeed.
        """
        conn = None
        try:
            conn = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="m_common", charset='utf8')
            cursor = conn.cursor()
            try:
                # Clear old rows and restart the auto-increment counter.
                cursor.execute('delete from proxy')
                cursor.execute('alter table proxy AUTO_INCREMENT=1')
                cursor.executemany("INSERT INTO proxy (ip,port,speed,address) VALUES (%s,%s,%s,%s)", insert_list)
                conn.commit()
            finally:
                cursor.close()
        except MySQLdb.Error as e:
            print("MysqlError%d:%s" % (e.args[0], e.args[1]))
        finally:
            if conn is not None:
                conn.close()

    # Sort the validated proxies by response time and persist them.
    proxy_ok = []
    with open("proxy_list.txt", 'w+') as f:
        for proxy in sorted(checkedProxyList, key=lambda item: item[3]):
            # Keep only proxies that answered in under 8 seconds.
            if proxy[3] < 8:
                # Database column order is ip, port, speed, address.
                proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))
                f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))
    db_insert(proxy_ok)
测试:
python proxy.py
结果如下:
['61.58.94.179', '8088', '\xe5\x8f\xb0\xe6\xb9\xbe\xe7\x9c\x81\xe5\x8f\xb0\xe6\xb9\xbe\xe5\xae\xbd\xe9\xa2\x91\xe9\x80\x9a\xe8\xae\xaf\xe9\xa1\xbe\xe9\x97\xae\xe8\x82\xa1\xe4\xbb\xbd\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\xe5\x8f\xb8']
['200.84.116.99', '9064', '\xe5\xa7\x94\xe5\x86\x85\xe7\x91\x9e\xe6\x8b\x89']
['183.223.204.8', '8123', '\xe5\x9b\x9b\xe5\xb7\x9d\xe7\x9c\x81\xe8\x87\xaa\xe8\xb4\xa1\xe5\xb8\x82\xe7\xa7\xbb\xe5\x8a\xa8']
..........总共抓取了1921个代理..........
..........总共有524个代理通过校验..........

# more proxy_list.txt
202.106.169.142:80	北京市联通ADSL	0.291432857513
111.13.136.59:80	北京市移动	0.297957897186
111.13.136.56:80	北京市移动	0.373070955276
111.206.81.248:80	北京市联通	0.403017997742
111.13.136.58:80	北京市移动	0.414332151413
124.202.217.134:8118	北京市电信通	0.416817903519
124.202.183.218:8118	北京市电信通	0.426618099213
120.132.71.232:80	北京市联通	0.440200090408
61.232.6.164:8081	北京市铁通	0.469615936279
118.144.96.253:80	北京市电信通	0.485229969025
203.192.10.66:80	北京市新华社	0.51485991478
124.202.182.22:8118	北京市电信通	0.553130865097
数据库:
mysql> select * from m_common.proxy limit 10;
+----------+-----------------+------+----------+----------------------+---------------------+
| proxy_id | ip              | port | speed    | address              | create_time         |
+----------+-----------------+------+----------+----------------------+---------------------+
|        1 | 202.106.169.142 |   80 | 0.291433 | 北京市联通ADSL       | 2015-02-26 11:29:24 |
|        2 | 111.13.136.59   |   80 | 0.297958 | 北京市移动           | 2015-02-26 11:29:24 |
|        3 | 111.13.136.56   |   80 | 0.373071 | 北京市移动           | 2015-02-26 11:29:24 |
|        4 | 111.206.81.248  |   80 | 0.403018 | 北京市联通           | 2015-02-26 11:29:24 |
|        5 | 111.13.136.58   |   80 | 0.414332 | 北京市移动           | 2015-02-26 11:29:24 |
|        6 | 124.202.217.134 | 8118 | 0.416818 | 北京市电信通         | 2015-02-26 11:29:24 |
|        7 | 124.202.183.218 | 8118 | 0.426618 | 北京市电信通         | 2015-02-26 11:29:24 |
|        8 | 120.132.71.232  |   80 |   0.4402 | 北京市联通           | 2015-02-26 11:29:24 |
|        9 | 61.232.6.164    | 8081 | 0.469616 | 北京市铁通           | 2015-02-26 11:29:24 |
|       10 | 118.144.96.253  |   80 |  0.48523 | 北京市电信通         | 2015-02-26 11:29:24 |
+----------+-----------------+------+----------+----------------------+---------------------+
10 rows in set (0.00 sec)