Python实现多线程抓取妹子图
心血来潮写了个多线程抓取妹子图的脚本，虽然代码还有一些瑕疵，但还是记录下来，分享给大家。
Pic_downloader.py
#-*-coding:utf-8-*-
"""
Created on Fri Aug 07 17:30:58 2015
@author: Dreace
"""
import os
import random
import sys
import time
import urllib2
from multiprocessing.dummy import Pool as ThreadPool

# Encoding used to re-encode the UTF-8 status messages before printing them.
type_ = sys.getfilesystemencoding()
def rename():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``."""
    timestamp_format = "%Y%m%d%H%M%S"
    return time.strftime(timestamp_format, time.localtime())
def rename_2(name):
    """Build a ``.jpg`` file name from a counter string.

    Args:
        name: Counter rendered as a string.

    Returns:
        ``name`` left-padded with ``'0'`` to three characters when it is one
        or two characters long, otherwise unchanged, followed by ``'.jpg'``.
    """
    # An empty string is left unpadded: only lengths 1 and 2 are padded.
    stem = name.rjust(3, '0') if name else name
    return stem + '.jpg'
defdownload_pic(i):
globalcount
globaltime_out
ifFilter(i):
try:
content=urllib2.urlopen(i,timeout=time_out)
url_content=content.read()
f=open(repr(random.randint(10000,999999999))+"_"+rename_2(repr(count)),"wb")
f.write(url_content)
f.close()
count+=1
exceptException,e:
printi+"下载超时,跳过!".decode("utf-8").encode(type_)
def Filter(content, patterns=None):
    """Return True when ``content`` contains none of the blocked substrings.

    The previous loop returned True as soon as a single entry was missing
    from ``content``, so a URL was rejected only if it contained every entry
    at once; it also fell through returning None instead of False.

    Args:
        content: String to check (an image URL in this script).
        patterns: Optional iterable of blocked substrings. Defaults to the
            module-level ``Filter_list``.

    Returns:
        True if no blocked substring occurs in ``content``, otherwise False.
    """
    if patterns is None:
        patterns = Filter_list
    for line in patterns:
        # Trailing newlines are stripped, as the original loop did.
        line = line.strip('\n')
        # An empty entry would match every string, so it is ignored.
        if line and line in content:
            return False
    return True
defget_pic(url_address):
globalpic_list
try:
str_=urllib2.urlopen(url_address,timeout=time_out).read()
url_content=str_.split("\"")
foriinurl_content:
ifi.find(".jpg")!=-1:
pic_list.append(i)
exceptException,e:
print"获取图片超时,跳过!".decode("utf-8").encode(type_)
MAX=2
count=0
time_out=60
thread_num=30
pic_list=[]
page_list=[]
Filter_list=["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"]
dir_name="C:\Photos\\"+rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time=time.time()
url_address="http://sexy.faceks.com/?page="
foriinrange(1,MAX+1):
page_list.append(url_address+repr(i))
page_pool=ThreadPool(thread_num)
page_pool.map(get_pic,page_list)
print"获取到".decode("utf-8").encode(type_),len(pic_list),"张图片,开始下载!".decode("utf-8").encode(type_)
pool=ThreadPool(thread_num)
pool.map(download_pic,pic_list)
pool.close()
pool.join()
printcount,"张图片保存在".decode("utf-8").encode(type_)+dir_name
print"共耗时".decode("utf-8").encode(type_),time.time()-start_time,"s"
我们来看下一个网友的作品
#coding:utf-8#############################################################
#FileName:main.py
#Author:mylonly
#mail:mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python
import re
import urllib2
import HTMLParser
import threading
import Queue
import time

# Entry links of the individual galleries.
htmlDoorList = []
# Links of the HTML pages that contain images.
htmlUrlList = []
# Queue of image URLs.
imageUrlList = Queue.Queue(0)
# Number of image URLs captured so far.
imageGetCount = 0
# Number of images downloaded so far.
imageDownloadCount = 0
# Start address of each gallery, used to detect when to stop.
nextHtmlUrl = ''
# Local directory the images are saved to.
localSavePath = '/data/1920x1080/'
# To download a different resolution, change replace_str. Available
# resolutions: 1920x1200, 1980x1920, 1680x1050, 1600x900, 1440x900,
# 1366x768, 1280x1024, 1024x768, 1280x800.
replace_str = '1920x1080'
replaced_str = '960x600'
#内页分析处理类
classImageHtmlParser(HTMLParser.HTMLParser):
def__init__(self):
self.nextUrl=''
HTMLParser.HTMLParser.__init__(self)
defhandle_starttag(self,tag,attrs):
globalimageUrlList
if(tag=='img'andlen(attrs)>2):
if(attrs[0]==('id','bigImg')):
url=attrs[1][1]
url=url.replace(replaced_str,replace_str)
imageUrlList.put(url)
globalimageGetCount
imageGetCount=imageGetCount+1
printurl
elif(tag=='a'andlen(attrs)==4):
if(attrs[0]==('id','pageNext')andattrs[1]==('class','next')):
globalnextHtmlUrl
nextHtmlUrl=attrs[2][1];
#首页分析类
classIndexHtmlParser(HTMLParser.HTMLParser):
def__init__(self):
self.urlList=[]
self.index=0
self.nextUrl=''
self.tagList=['li','a']
self.classList=['photo-list-padding','pic']
HTMLParser.HTMLParser.__init__(self)
defhandle_starttag(self,tag,attrs):
if(tag==self.tagList[self.index]):
forattrinattrs:
if(attr[1]==self.classList[self.index]):
if(self.index==0):
#第一层找到了
self.index=1
else:
#第二层找到了
self.index=0
printattrs[1][1]
self.urlList.append(attrs[1][1])
break
elif(tag=='a'):
forattrinattrs:
if(attr[0]=='id'andattr[1]=='pageNext'):
self.nextUrl=attrs[1][1]
print'nextUrl:',self.nextUrl
break
#首页Hmtl解析器
indexParser=IndexHtmlParser()
#内页Html解析器
imageParser=ImageHtmlParser()
#根据首页得到所有入口链接
print'开始扫描首页...'
host='http://desk.zol.com.cn'
indexUrl='/meinv/'
while(indexUrl!=''):
print'正在抓取网页:',host+indexUrl
request=urllib2.Request(host+indexUrl)
try:
m=urllib2.urlopen(request)
con=m.read()
indexParser.feed(con)
if(indexUrl==indexParser.nextUrl):
break
else:
indexUrl=indexParser.nextUrl
excepturllib2.URLError,e:
printe.reason
print'首页扫描完成,所有图集链接已获得:'
htmlDoorList=indexParser.urlList
#根据入口链接得到所有图片的url
classgetImageUrl(threading.Thread):
def__init__(self):
threading.Thread.__init__(self)
defrun(self):
fordoorinhtmlDoorList:
print'开始获取图片地址,入口地址为:',door
globalnextHtmlUrl
nextHtmlUrl=''
while(door!=''):
print'开始从网页%s获取图片...'%(host+door)
if(nextHtmlUrl!=''):
request=urllib2.Request(host+nextHtmlUrl)
else:
request=urllib2.Request(host+door)
try:
m=urllib2.urlopen(request)
con=m.read()
imageParser.feed(con)
print'下一个页面地址为:',nextHtmlUrl
if(door==nextHtmlUrl):
break
excepturllib2.URLError,e:
printe.reason
print'所有图片地址均已获得:',imageUrlList
classgetImage(threading.Thread):
def__init__(self):
threading.Thread.__init__(self)
defrun(self):
globalimageUrlList
print'开始下载图片...'
while(True):
print'目前捕获图片数量:',imageGetCount
print'已下载图片数量:',imageDownloadCount
image=imageUrlList.get()
print'下载文件路径:',image
try:
cont=urllib2.urlopen(image).read()
patter='[0-9]*\.jpg';
match=re.search(patter,image);
ifmatch:
print'正在下载文件:',match.group()
filename=localSavePath+match.group()
f=open(filename,'wb')
f.write(cont)
f.close()
globalimageDownloadCount
imageDownloadCount=imageDownloadCount+1
else:
print'nomatch'
if(imageUrlList.empty()):
break
excepturllib2.URLError,e:
printe.reason
print'文件全部下载完成...'
get=getImageUrl()
get.start()
print'获取图片链接线程启动:'
time.sleep(2)
download=getImage()
download.start()
print'下载图片链接线程启动:'
批量抓取指定网页上的所有图片
#-*-coding:utf-8-*-
#coding=UTF-8
import os
import urllib
import urllib2
import re

# Page that is fetched and scanned for image URLs.
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
# Prefix prepended to every output file name.
outpath = "t:\\"
defgetHtml(url):
webfile=urllib.urlopen(url)
outhtml=webfile.read()
printouthtml
returnouthtml
# One alternative per scheme/extension pair, in the same order as the
# hand-written list it replaces. Generating the list also fixes the original
# typo where the https ``.jpeg`` entry appeared twice and https ``.jpg`` was
# missing, so https JPG URLs were never matched.
_IMAGE_URL_RE = re.compile(
    '(' + '|'.join(
        [scheme + r'://[^\s,"]*\.' + extension
         for scheme in ('http', 'https')
         for extension in ('jpg', 'jpeg', 'png', 'gif', 'bmp')]
    ) + ')'
)


def getImageList(html):
    """Return every image URL found in ``html``.

    A URL starts with ``http://`` or ``https://``, contains no whitespace,
    comma or double quote, and ends in ``.jpg``, ``.jpeg``, ``.png``,
    ``.gif`` or ``.bmp`` (lower case only). The list is also echoed to
    stdout.

    Args:
        html: Page source to scan.

    Returns:
        List of matched URLs in order of appearance.
    """
    imgList = _IMAGE_URL_RE.findall(html)
    print(imgList)
    return imgList
defdownload(imgList,page):
x=1
forimgurlinimgList:
filepathname=str(outpath+'pic_%09d_%010d'%(page,x)+str(os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1])).lower()
print'[Debug]Downloadfile:'+imgurl+'>>'+filepathname
urllib.urlretrieve(imgurl,filepathname)
x+=1
def downImageNum(pagenum):
    """Run the fetch / extract / download pipeline ``pagenum`` times.

    Args:
        pagenum: Number of passes; pass numbers run from 1 to ``pagenum``.
    """
    page = 1
    while page <= pagenum:
        # NOTE(review): the same module-level ``url`` is fetched on every
        # pass; ``page`` only ends up in the output file names.
        # Fetch the HTML, extract the image URLs, then download them all.
        download(getImageList(getHtml(url)), page)
        page += 1
if __name__ == "__main__":
    # Run a single pass of the downloader when executed as a script.
    downImageNum(1)
以上就是为大家汇总的 3 款用 Python 实现的批量抓取妹子图片的代码，希望对大家学习 Python 爬虫有所帮助。