Python urllib、urllib2、httplib抓取网页代码实例
使用urllib2,太强大了
试了下用代理登陆拉取cookie,跳转抓图片......
文档:http://docs.python.org/library/urllib2.html
直接上demo代码了
包括:直接拉取,使用Request(post/get),使用代理,cookie,跳转处理
#!/usr/bin/python
#-*-coding:utf-8-*-
#urllib2_test.py
#author:wklken
# 2012-03-17 wklken@yeah.net
# Standard-library imports (Python 2: urllib2/cookielib were merged into
# urllib.request / http.cookiejar in Python 3).
import urllib
import urllib2
import cookielib
import socket

# Target URL shared by the demos below.
url = "http://www.testurl....."  # change yourself
#最简单方式
defuse_urllib2():
try:
f=urllib2.urlopen(url,timeout=5).read()
excepturllib2.URLError,e:
printe.reason
printlen(f)
#使用Request
defget_request():
#可以设置超时
socket.setdefaulttimeout(5)
#可以加入参数[无参数,使用get,以下这种方式,使用post]
params={"wd":"a","b":"2"}
#可以加入请求头信息,以便识别
i_headers={"User-Agent":"Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.9.1)Gecko/20090624Firefox/3.5",
"Accept":"text/plain"}
#usepost,havesomeparamsposttoserver,ifnotsupport,willthrowexception
#req=urllib2.Request(url,data=urllib.urlencode(params),headers=i_headers)
req=urllib2.Request(url,headers=i_headers)
#创建request后,还可以进行其他添加,若是key重复,后者生效
#request.add_header('Accept','application/json')
#可以指定提交方式
#request.get_method=lambda:'PUT'
try:
page=urllib2.urlopen(req)
printlen(page.read())
#likeget
#url_params=urllib.urlencode({"a":"1","b":"2"})
#final_url=url+"?"+url_params
#printfinal_url
#data=urllib2.urlopen(final_url).read()
#print"Method:get",len(data)
excepturllib2.HTTPError,e:
print"ErrorCode:",e.code
excepturllib2.URLError,e:
print"ErrorReason:",e.reason
defuse_proxy():
enable_proxy=False
proxy_handler=urllib2.ProxyHandler({"http":"http://proxyurlXXXX.com:8080"})
null_proxy_handler=urllib2.ProxyHandler({})
ifenable_proxy:
opener=urllib2.build_opener(proxy_handler,urllib2.HTTPHandler)
else:
opener=urllib2.build_opener(null_proxy_handler,urllib2.HTTPHandler)
#此句设置urllib2的全局opener
urllib2.install_opener(opener)
content=urllib2.urlopen(url).read()
print"proxylen:",len(content)
class NoExceptionCookieProcesser(urllib2.HTTPCookieProcessor):
    """Cookie processor that swallows common HTTP error statuses.

    Instead of letting 403/400/500 responses raise HTTPError, each handler
    returns the response object so the caller can still inspect the body and
    the cookies it set. (The class name keeps the original's spelling of
    "Processor" -- renaming it would break existing callers.)
    """

    def http_error_403(self, req, fp, code, msg, hdrs):
        return fp

    def http_error_400(self, req, fp, code, msg, hdrs):
        return fp

    def http_error_500(self, req, fp, code, msg, hdrs):
        return fp
defhand_cookie():
cookie=cookielib.CookieJar()
#cookie_handler=urllib2.HTTPCookieProcessor(cookie)
#afteradderrorexceptionhandler
cookie_handler=NoExceptionCookieProcesser(cookie)
opener=urllib2.build_opener(cookie_handler,urllib2.HTTPHandler)
url_login="https://www.yourwebsite/?login"
params={"username":"user","password":"111111"}
opener.open(url_login,urllib.urlencode(params))
foritemincookie:
printitem.name,item.value
#urllib2.install_opener(opener)
#content=urllib2.urlopen(url).read()
#printlen(content)
#得到重定向N次以后最后页面URL
defget_request_direct():
importhttplib
httplib.HTTPConnection.debuglevel=1
request=urllib2.Request("http://www.google.com")
request.add_header("Accept","text/html,*/*")
request.add_header("Connection","Keep-Alive")
opener=urllib2.build_opener()
f=opener.open(request)
printf.url
printf.headers.dict
printlen(f.read())
if __name__ == "__main__":
    # Run every demo in turn; each hits the network independently.
    use_urllib2()
    get_request()
    get_request_direct()
    use_proxy()
    hand_cookie()