Fetching images from a web page with Python and saving them locally
The previous article covered a PHP implementation for batch-fetching remote images and saving them locally; see that post for details.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Return the extension of a file name.'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create a directory if it does not exist, and return its path.'''
def mkdir(path):
    # strip surrounding whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Generate a unique string with a fixed length of 36 characters.'''
def unique_str():
    return str(uuid.uuid1())

'''
Fetch the contents of a URL into memory.
@url  the file to fetch, path + filename
'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''
Save data to a local file.
@path       local directory
@file_name  file name
@data       file contents
'''
def save_file(path, file_name, data):
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get a file extension
print get_file_extension("123.jpg")
# create a directory and return it
# print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
Fetching the images on a specified URL with Python and saving them locally
# *** encoding: utf-8 ***
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

"""judge whether a url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:  # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:  # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found
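# Example: httpExists('http://www.example.com/index.html') issues a HEAD
# request and returns True on a 200 response, follows a 302 redirect by
# recursing on its Location header, and returns False for anything else.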
"""gethtmlsrc,returnlines[]"""
defgGetHtmlLines(url):
ifurl==None:return
ifnothttpExists(url):return
try:
page=urllib.urlopen(url)
html=page.readlines()
page.close()
returnhtml
exceptException,e:
print"gGetHtmlLines()error!Exception==>>"+e
return
"""gethtmlsrc,returnstring"""
defgGetHtml(url):
ifurl==None:return
ifnothttpExists(url):return
try:
page=urllib.urlopen(url)
html=page.read()
page.close()
returnhtml
exceptException,e:
print"gGetHtml()error!Exception==>>"+e
return
"""根据url获取文件名"""
defgGetFileName(url):
ifurl==None:returnNone
ifurl=="":return""
arr=url.split("/")
returnarr[len(arr)-1]
"""生成随机文件名"""
defgRandFilename(type):
fname=''
foriinrange(16):
fname=fname+chr(random.randint(65,90))
fname=fname+chr(random.randint(48,57))
returnfname+'.'+type
"""根据url和其上的link,得到link的绝对地址"""
defgGetAbslLink(url,link):
ifurl==Noneorlink==None:return
ifurl==''orlink=='':returnurl
addr=''
iflink[0]=='/':
addr=gGetHttpAddr(url)+link
eliflen(link)>3andlink[0:4]=='http':
addr=link
eliflen(link)>2andlink[0:2]=='..':
addr=gGetHttpAddrFatherAssign(url,link)
else:
addr=gGetHttpAddrFather(url)+link
returnaddr
"""根据输入的lines,匹配正则表达式,返回list"""
defgGetRegList(linesList,regx):
iflinesList==None:return
rtnList=[]
forlineinlinesList:
matchs=re.search(regx,line,re.IGNORECASE)
ifmatchs!=None:
allGroups=matchs.groups()
forfoundStrinallGroups:
iffoundStrnotinrtnList:
rtnList.append(foundStr)
returnrtnList
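# Example: gGetRegList(['<img src="pics/a.jpg">'], r'src\s*="?(\S+)\.jpg')
# returns ['pics/a']; the callers re-append the '.jpg' suffix themselves.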
"""根据url下载文件,文件名参数指定"""
defgDownloadWithFilename(url,savePath,file):
#参数检查,现忽略
try:
urlopen=urllib.URLopener()
fp=urlopen.open(url)
data=fp.read()
fp.close()
file=open(savePath+file,'w+b')
file.write(data)
file.close()
exceptIOError,error:
print"DOWNLOAD%sERROR!==>>%s"%(url,error)
exceptException,e:
print"Exception==>>"+e
"""根据url下载文件,文件名自动从url获取"""
defgDownload(url,savePath):
#参数检查,现忽略
fileName=gGetFileName(url)
#fileName=gRandFilename('jpg')
gDownloadWithFilename(url,savePath,fileName)
"""根据某网页的url,下载该网页的jpg"""
defgDownloadHtmlJpg(downloadUrl,savePath):
lines=gGetHtmlLines(downloadUrl)#'getthepagesource'
regx=r"""src\s*="?(\S+)\.jpg"""
lists=gGetRegList(lines,regx)#'getthelinkswhichmatchregularexpress'
iflists==None:return
forjpginlists:
jpg=gGetAbslLink(downloadUrl,jpg)+'.jpg'
gDownload(jpg,savePath)
printgGetFileName(jpg)
"""根据url取主站地址"""
defgGetHttpAddr(url):
ifurl=='':return''
arr=url.split("/")
returnarr[0]+"//"+arr[2]
"""根据url取上级目录"""
defgGetHttpAddrFather(url):
ifurl=='':return''
arr=url.split("/")
addr=arr[0]+'//'+arr[2]+'/'
iflen(arr)-1>3:
foriinrange(3,len(arr)-1):
addr=addr+arr[i]+'/'
returnaddr
"""根据url和上级的link取link的绝对地址"""
defgGetHttpAddrFatherAssign(url,link):
ifurl=='':return''
iflink=='':return''
linkArray=link.split("/")
urlArray=url.split("/")
partLink=''
partUrl=''
foriinrange(len(linkArray)):
iflinkArray[i]=='..':
numOfFather=i+1#上级数
else:
partLink=partLink+'/'+linkArray[i]
foriinrange(len(urlArray)-1-numOfFather):
partUrl=partUrl+urlArray[i]
ifi<len(urlArray)-1-numOfFather-1:
partUrl=partUrl+'/'
returnpartUrl+partLink
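# Worked example:
#   gGetHttpAddrFatherAssign('http://a.com/b/c/d.htm', '../img/x')
#   the single '..' gives numOfFather = 1 and partLink = '/img/x';
#   the url is cut back to 'http://a.com/b', so the result is
#   'http://a.com/b/img/x'.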
"""根据url获取其上的相关htm、html链接,返回list"""
defgGetHtmlLink(url):
#参数检查,现忽略
rtnList=[]
lines=gGetHtmlLines(url)
regx=r"""href="?(\S+)\.htm"""
forlinkingGetRegList(lines,regx):
link=gGetAbslLink(url,link)+'.htm'
iflinknotinrtnList:
rtnList.append(link)
printlink
returnrtnList
"""根据url,抓取其上的jpg和其链接htm上的jpg"""
defgDownloadAllJpg(url,savePath):
#参数检查,现忽略
gDownloadHtmlJpg(url,savePath)
#抓取link上的jpg
links=gGetHtmlLink(url)
forlinkinlinks:
gDownloadHtmlJpg(link,savePath)
"""test"""
defmain():
u='http://site.douban.com/196738/room/2462453/'#想要抓取图片的地址
save='/root/python/tmp/'#图片所要存放的目录
print'downloadpicfrom['+u+']'
print'saveto['+save+']...'
gDownloadHtmlJpg(u,save)
print"downloadfinished"
if__name__=="__main__":
main()
else:
print"calledfromintern."
The code above is everything for fetching images from a web page with Python and saving them locally. We hope you find it useful.