使用Python3编写抓取网页和只抓网页图片的脚本
最基本的抓取网页内容的代码实现:
#!/usr/bin/env python3
"""Fetch a web page and print its first and last non-blank lines."""
from urllib.request import urlretrieve


def firstNonBlank(lines):
    """Return the first entry of *lines* that is not blank.

    A line counts as blank when it is empty or contains only whitespace.

    Args:
        lines: An iterable of strings.

    Returns:
        The first non-blank string, unchanged (trailing newline included),
        or ``None`` when there is no such line.
    """
    for eachLine in lines:
        if eachLine.strip():
            return eachLine
    return None


def firstLast(webpage):
    """Print the first and the last non-blank line of a saved page.

    Args:
        webpage: Path of a local file holding the downloaded page.
    """
    # The page's real encoding is not known here, so undecodable bytes are
    # replaced instead of aborting with UnicodeDecodeError.
    with open(webpage, encoding='utf-8', errors='replace') as f:
        lines = f.readlines()

    first = firstNonBlank(lines)
    if first is None:
        # Empty or all-blank page: there is nothing meaningful to print.
        return
    lines.reverse()
    last = firstNonBlank(lines)

    # Strip the line's own newline and let print() supply exactly one, so
    # a final line without a trailing newline is still terminated.
    print(first.rstrip('\n'))
    print(last.rstrip('\n'))


def download(url='http://www', process=firstLast):
    """Download *url* to a temporary file and pass its path to *process*.

    Args:
        url: Address of the page to retrieve.
        process: Callable invoked with the local file name of the
            downloaded page. It is not called when the download fails.
    """
    try:
        # urlretrieve() returns (local_filename, headers).
        retval = urlretrieve(url)[0]
    except OSError:
        # IOError is an alias of OSError in Python 3, and urllib's
        # URLError derives from it.
        retval = None
    if retval:
        process(retval)


if __name__ == '__main__':
    download()
利用 urllib 模块实现抓取网页中图片的功能:
import urllib.request
import socket
import re
import sys
import os

targetDir = r"C:\Users\elqstux\Desktop\pic"


def destFile(path):
    """Return the local file path under ``targetDir`` for the URL *path*.

    The file name is the part of *path* after its last ``/``. The target
    directory is created first when it does not exist yet.

    Args:
        path: URL of the image to be saved.

    Returns:
        ``targetDir`` joined with the final component of *path*.
    """
    # makedirs() also creates missing parent directories, which a plain
    # mkdir() would fail on.
    os.makedirs(targetDir, exist_ok=True)
    return os.path.join(targetDir, path.rsplit('/', 1)[-1])


def findImages(html):
    """Return ``(url, extension)`` tuples for the image links in *html*.

    Args:
        html: Page source as text (``str``).

    Returns:
        A list with one ``(url, extension)`` tuple per match, in document
        order, duplicates included.
    """
    # NOTE: the pattern only recognises plain "http:" links; "https:"
    # links are not matched.
    return re.findall(r'(http:[^\s]*?(jpg|png|gif))', html)


def main():
    """Download every distinct image linked from the douban.com front page."""
    hostname = "http://www.douban.com"
    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()

    # Decode to real text. Running the regex over str(contentBytes) would
    # scan the repr() of the bytes object, where newlines appear as the two
    # characters "\n" and therefore no longer terminate a [^\s] run.
    html = contentBytes.decode('utf-8', errors='replace')

    for link, _ext in set(findImages(html)):
        print(link)
        try:
            urllib.request.urlretrieve(link, destFile(link))
        except OSError as err:
            # One unreachable image should not abort the remaining ones.
            print("failed to download %s: %s" % (link, err), file=sys.stderr)


if __name__ == "__main__":
    main()
import urllib.request
import socket
import re
import sys
import os

targetDir = r"H:\pic"


def destFile(path):
    """Return the local file path under ``targetDir`` for the URL *path*.

    The file name is the part of *path* after its last ``/``. The target
    directory is created first when it does not exist yet.

    Args:
        path: URL of the image to be saved.

    Returns:
        ``targetDir`` joined with the final component of *path*.
    """
    # makedirs() also creates missing parent directories, which a plain
    # mkdir() would fail on.
    os.makedirs(targetDir, exist_ok=True)
    # The URL is split on "/" and only its last component is kept.
    return os.path.join(targetDir, path.rsplit('/', 1)[-1])


def findImages(html):
    """Return ``(url, extension)`` tuples for the image links in *html*.

    The pattern contains two nested groups, so ``re.findall`` returns a
    list of 2-tuples; only the text captured by the groups appears in the
    result.

    Args:
        html: Page source as text (``str``).

    Returns:
        A list with one ``(url, extension)`` tuple per match, in document
        order, duplicates included.
    """
    # NOTE: the pattern only recognises plain "http:" links; "https:"
    # links are not matched.
    return re.findall(r'(http:[^\s]*?(jpg|png|gif))', html)


def main():
    """Print each image URL on the douban.com front page and its extension."""
    hostname = "http://www.douban.com/"
    req = urllib.request.Request(hostname)
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()

    # Decode to real text. Running the regex over str(contentBytes) would
    # scan the repr() of the bytes object, where newlines appear as the two
    # characters "\n" and therefore no longer terminate a [^\s] run.
    html = contentBytes.decode('utf-8', errors='replace')

    for picname, picType in findImages(html):
        print(picname)
        print(picType)

    # Sample output:
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g111328-1.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g197523-19.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   ...


if __name__ == "__main__":
    main()