Python urllib、urllib2、httplib抓取网页代码实例
使用urllib2,太强大了
试了下用代理登陆拉取cookie,跳转抓图片......
文档:http://docs.python.org/library/urllib2.html
直接上demo代码了
包括:直接拉取,使用Request(post/get),使用代理,cookie,跳转处理
#!/usr/bin/python #-*-coding:utf-8-*- #urllib2_test.py #author:wklken #2012-03-17wklken@yeah.net importurllib,urllib2,cookielib,socket url="http://www.testurl....."#changeyourself #最简单方式 defuse_urllib2(): try: f=urllib2.urlopen(url,timeout=5).read() excepturllib2.URLError,e: printe.reason printlen(f) #使用Request defget_request(): #可以设置超时 socket.setdefaulttimeout(5) #可以加入参数[无参数,使用get,以下这种方式,使用post] params={"wd":"a","b":"2"} #可以加入请求头信息,以便识别 i_headers={"User-Agent":"Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.9.1)Gecko/20090624Firefox/3.5", "Accept":"text/plain"} #usepost,havesomeparamsposttoserver,ifnotsupport,willthrowexception #req=urllib2.Request(url,data=urllib.urlencode(params),headers=i_headers) req=urllib2.Request(url,headers=i_headers) #创建request后,还可以进行其他添加,若是key重复,后者生效 #request.add_header('Accept','application/json') #可以指定提交方式 #request.get_method=lambda:'PUT' try: page=urllib2.urlopen(req) printlen(page.read()) #likeget #url_params=urllib.urlencode({"a":"1","b":"2"}) #final_url=url+"?"+url_params #printfinal_url #data=urllib2.urlopen(final_url).read() #print"Method:get",len(data) excepturllib2.HTTPError,e: print"ErrorCode:",e.code excepturllib2.URLError,e: print"ErrorReason:",e.reason defuse_proxy(): enable_proxy=False proxy_handler=urllib2.ProxyHandler({"http":"http://proxyurlXXXX.com:8080"}) null_proxy_handler=urllib2.ProxyHandler({}) ifenable_proxy: opener=urllib2.build_opener(proxy_handler,urllib2.HTTPHandler) else: opener=urllib2.build_opener(null_proxy_handler,urllib2.HTTPHandler) #此句设置urllib2的全局opener urllib2.install_opener(opener) content=urllib2.urlopen(url).read() print"proxylen:",len(content) classNoExceptionCookieProcesser(urllib2.HTTPCookieProcessor): defhttp_error_403(self,req,fp,code,msg,hdrs): returnfp defhttp_error_400(self,req,fp,code,msg,hdrs): returnfp defhttp_error_500(self,req,fp,code,msg,hdrs): returnfp defhand_cookie(): cookie=cookielib.CookieJar() #cookie_handler=urllib2.HTTPCookieProcessor(cookie) #afteradderrorexceptionhandler 
cookie_handler=NoExceptionCookieProcesser(cookie) opener=urllib2.build_opener(cookie_handler,urllib2.HTTPHandler) url_login="https://www.yourwebsite/?login" params={"username":"user","password":"111111"} opener.open(url_login,urllib.urlencode(params)) foritemincookie: printitem.name,item.value #urllib2.install_opener(opener) #content=urllib2.urlopen(url).read() #printlen(content) #得到重定向N次以后最后页面URL defget_request_direct(): importhttplib httplib.HTTPConnection.debuglevel=1 request=urllib2.Request("http://www.google.com") request.add_header("Accept","text/html,*/*") request.add_header("Connection","Keep-Alive") opener=urllib2.build_opener() f=opener.open(request) printf.url printf.headers.dict printlen(f.read()) if__name__=="__main__": use_urllib2() get_request() get_request_direct() use_proxy() hand_cookie()