Scraping images from a web page with Python and saving them locally
In a previous article we shared a PHP implementation for batch-downloading images from remote web pages and saving them locally; interested readers can refer to that article for details.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

def get_file_extension(file):
    """Return the extension of a file name."""
    return os.path.splitext(file)[1]

def mkdir(path):
    """Create the directory if it does not exist and return it."""
    # strip surrounding whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

def unique_str():
    """Generate a unique string with a fixed length of 36."""
    return str(uuid.uuid1())

def get_file(url):
    """Fetch the contents of a URL into memory.
    @url: the file to fetch, path + filename
    """
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

def save_file(path, file_name, data):
    """Save data to a local file.
    @path: local directory
    @file_name: file name
    @data: file contents
    """
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get the file extension
print get_file_extension("123.jpg")
# create the directory and return it
# print mkdir("d:/ljq")
# generate a unique string with a fixed length of 36
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
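The script above is written for Python 2: urllib2 and cookielib no longer exist in Python 3, where they were folded into urllib.request and http.cookiejar. For readers on Python 3, a minimal sketch of the same fetch-and-save idea might look like the following; the URL and the d:/ljq/ save path are simply the sample values reused from the script above.

# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the same idea (assumption: Python 3 environment;
# urllib2/cookielib are replaced by urllib.request and http.cookiejar).
import os
import urllib.request
import http.cookiejar

def get_file(url):
    """Fetch the raw bytes of a URL, sending cookies like the Python 2 version."""
    cj = http.cookiejar.LWPCookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    try:
        return opener.open(url).read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Create the directory if needed and write the bytes to path/file_name."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

# sample URL and path from the script above; adjust to your own environment
save_file("d:/ljq/", "123.jpg",
          get_file("http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"))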
Scraping the images from a specified URL with Python and saving them locally
# -*- coding: utf-8 -*-
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

def httpExists(url):
    """Judge whether the url exists or not."""
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:  # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:  # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

def gGetHtmlLines(url):
    """Get the html source, return a list of lines."""
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> " + str(e)
        return

def gGetHtml(url):
    """Get the html source, return a string."""
    if url == None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> " + str(e)
        return

def gGetFileName(url):
    """Get the file name from the url."""
    if url == None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

def gRandFilename(type):
    """Generate a random file name."""
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

def gGetAbslLink(url, link):
    """Get the absolute address of a link found on the page at url."""
    if url == None or link == None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

def gGetRegList(linesList, regx):
    """Match the regular expression against the input lines, return a list."""
    if linesList == None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

def gDownloadWithFilename(url, savePath, file):
    """Download the file at url, with the file name given as a parameter."""
    # parameter checking omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR! ==>> %s" % (url, error)
    except Exception, e:
        print "Exception ==>> " + str(e)

def gDownload(url, savePath):
    """Download the file at url, taking the file name from the url."""
    # parameter checking omitted for now
    fileName = gGetFileName(url)
    # fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download the jpgs on the page at downloadUrl."""
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)  # get the links which match the regular expression
    if lists == None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

def gGetHttpAddr(url):
    """Get the site root address from the url."""
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

def gGetHttpAddrFather(url):
    """Get the parent directory from the url."""
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

def gGetHttpAddrFatherAssign(url, link):
    """Get the absolute address of a relative (..) link found on the page at url."""
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of parent levels
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

def gGetHtmlLink(url):
    """Get the htm/html links on the page at url, return a list."""
    # parameter checking omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

def gDownloadAllJpg(url, savePath):
    """Download the jpgs on the page at url and on the htm pages it links to."""
    # parameter checking omitted for now
    gDownloadHtmlJpg(url, savePath)
    # download the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

def main():
    """Test."""
    u = 'http://site.douban.com/196738/room/2462453/'  # page to scrape images from
    save = '/root/python/tmp/'  # directory to save the images in
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
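Note that main() only calls gDownloadHtmlJpg, which downloads the jpgs found on the start page itself. The script also defines gDownloadAllJpg, which additionally follows every .htm link on that page; a small usage sketch, reusing the sample URL and save directory from main():

# Sketch: crawl one level deeper than main() does (same Python 2 assumptions as the script above).
u = 'http://site.douban.com/196738/room/2462453/'  # start page
save = '/root/python/tmp/'                         # save directory
gDownloadAllJpg(u, save)  # jpgs on the start page, plus jpgs on every .htm page it links to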
The code above is everything you need to scrape the images from a web page with Python and save them locally. We hope you find it useful.