使用Python3编写抓取网页和只抓网页图片的脚本
最基本的抓取网页内容的代码实现:
#!/usr/bin/env python3
# Minimal page fetcher: download a URL to a temporary file and print the
# first and last non-blank lines of what was retrieved.
from urllib.request import urlretrieve


def firstNonBlank(lines):
    """Return the first non-blank element of *lines*.

    A line counts as blank when ``line.strip()`` is empty.

    Args:
        lines: Iterable of strings.

    Returns:
        The first non-blank line, unchanged (its trailing newline, if
        any, is kept), or ``None`` when there is no such line.
    """
    for eachLine in lines:
        if eachLine.strip():
            return eachLine
    return None


def firstLast(webpage):
    """Print the first and last non-blank lines of the file *webpage*.

    Args:
        webpage: Path of a local text file.

    Nothing is printed when the file has no non-blank line.
    """
    # errors="replace" keeps pages in other encodings from raising
    # UnicodeDecodeError; only two lines are echoed anyway.
    with open(webpage, encoding="utf-8", errors="replace") as f:
        lines = f.readlines()

    first = firstNonBlank(lines)
    if first is None:
        return

    # Lines read from the file keep their own newline, so suppress the
    # one print() would add.
    print(first, end="")
    lines.reverse()
    print(firstNonBlank(lines), end="")


def download(url='http://www', process=firstLast):
    """Retrieve *url* to a local file and hand that file to *process*.

    Args:
        url: Address to download.
        process: Callable taking the path of the downloaded file.

    Retrieval failures that raise ``OSError`` (which includes
    ``urllib.error.URLError``) are swallowed: *process* is simply not
    called.
    """
    try:
        retval = urlretrieve(url)[0]
    except OSError:
        retval = None
    if retval:
        process(retval)


if __name__ == '__main__':
    download()
利用urllib模块,来实现一个网页中针对图片的抓取功能:
import os
import re
import socket
import sys
import urllib.request

# Directory that downloaded images are saved into.
targetDir = r"C:\Users\elqstux\Desktop\pic"
def destFile(path):
    """Return the local path under ``targetDir`` at which to save *path*.

    Args:
        path: URL (or slash-separated path) of the file being downloaded.

    Returns:
        ``targetDir`` joined with the part of *path* after its last ``/``
        (all of *path* when it contains no ``/``).

    ``targetDir`` is created first if it does not exist yet.
    """
    # makedirs also creates missing parent directories and does not fail
    # when the directory is already there.
    os.makedirs(targetDir, exist_ok=True)
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(targetDir, filename)
if __name__ == "__main__":
    # Fetch the page, then download every distinct image URL found in it.
    hostname = "http://www.douban.com"
    req = urllib.request.Request(hostname)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()

    # Decode instead of str(bytes): str() would yield the "b'...'" repr,
    # in which newlines become the two characters "\n" and no longer stop
    # the [^\s] part of the pattern. URLs are ASCII, so undecodable bytes
    # can safely be replaced.
    html = contentBytes.decode("utf-8", errors="replace")

    # findall returns (full_url, extension) tuples; set() drops duplicates.
    for link, _ext in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', html)):
        print(link)
        urllib.request.urlretrieve(link, destFile(link))
import os
import re
import socket
import sys
import urllib.request

# Directory that downloaded images are saved into.
targetDir = r"H:\pic"
def destFile(path):
    """Return the local path under ``targetDir`` at which to save *path*.

    Args:
        path: URL (or slash-separated path) of the file being downloaded.

    Returns:
        ``targetDir`` joined with the part of *path* after its last ``/``
        (all of *path* when it contains no ``/``).

    ``targetDir`` is created first if it does not exist yet.
    """
    # makedirs also creates missing parent directories and does not fail
    # when the directory is already there.
    os.makedirs(targetDir, exist_ok=True)
    # The URL is split on "/" to obtain the bare file name.
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(targetDir, filename)
if __name__ == "__main__":
    # Fetch the page and list every image URL in it with its extension.
    hostname = "http://www.douban.com/"
    req = urllib.request.Request(hostname)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(req) as webpage:
        contentBytes = webpage.read()

    # Decode instead of str(bytes): str() would yield the "b'...'" repr,
    # in which newlines become the two characters "\n" and no longer stop
    # the [^\s] part of the pattern. URLs are ASCII, so undecodable bytes
    # can safely be replaced.
    html = contentBytes.decode("utf-8", errors="replace")

    # The pattern contains two nested pairs of parentheses, i.e. two
    # groups, so findall returns a list of (full_url, extension) tuples;
    # only text captured by the groups appears in that list.
    match = re.findall(r'(http:[^\s]*?(jpg|png|gif))', html)
    for picname, picType in match:
        print(picname)
        print(picType)

    # Sample output:
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g111328-1.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   http://img3.douban.com/icon/g197523-19.jpg
    #   jpg
    #   http://img3.douban.com/pics/blank.gif
    #   gif
    #   ...