python3实现抓取网页资源的 N 种方法
这两天学习了python3实现抓取网页资源的方法,发现了很多种方法,所以,今天添加一点小笔记。
1、最简单
# Example 1: the simplest fetch -- open a URL and read the raw response bytes.
import urllib.request

response = urllib.request.urlopen('http://python.org/')
html = response.read()  # bytes; call .decode() if text is needed
2、使用Request
# Example 2: build an explicit Request object first, then open it.
# Useful when you want to attach headers or data before sending.
import urllib.request

req = urllib.request.Request('http://python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()  # bytes
3、发送数据
#!/usr/bin/env python3
# Example 3: POST form data to a URL.
import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
# NOTE: user_agent is defined here but never sent; see Example 4 for headers.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456',
}
# urlencode() returns a str, but in Python 3 the Request 'data' argument
# must be bytes -- encode explicitly (passing str raises TypeError).
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode("utf8"))
4、发送数据和header
#!/usr/bin/env python3
# Example 4: POST form data together with custom headers (User-Agent).
import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456',
}
headers = {'User-Agent': user_agent}
# In Python 3 the Request 'data' argument must be bytes, not str,
# so the urlencode() result is encoded before sending.
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode("utf8"))
5、http错误
#!/usr/bin/env python3
# Example 5: catch an HTTP error status (e.g. 404) raised by urlopen.
# urllib.error must be imported explicitly -- relying on urllib.request
# pulling it in as a side effect is an implementation detail.
import urllib.error
import urllib.request

req = urllib.request.Request('http://www.python.org/fish.html')
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code)  # numeric HTTP status code
    # HTTPError is also a file-like response: the error page body is readable.
    print(e.read().decode("utf8"))
6、异常处理1
#!/usr/bin/env python3
# Example 6: handle HTTPError and URLError with separate except clauses.
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request("http://twitter.com/")
try:
    response = urlopen(req)
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first.
    print('The server couldn\'t fulfill the request.')
    print('Error code:', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason:', e.reason)
else:
    # Success path: no exception was raised.
    print("good!")
    print(response.read().decode("utf8"))
7、异常处理2
#!/usr/bin/env python3
# Example 7: catch only URLError and inspect its attributes to tell
# "server unreachable" apart from "server returned an error".
from urllib.request import Request, urlopen
from urllib.error import URLError

req = Request("http://twitter.com/")
try:
    response = urlopen(req)
except URLError as e:
    if hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason:', e.reason)
    elif hasattr(e, 'code'):
        # NOTE(review): in Python 3 HTTPError also carries .reason, so this
        # branch is rarely reached in practice -- confirm intended semantics.
        print('The server couldn\'t fulfill the request.')
        print('Error code:', e.code)
else:
    print("good!")
    print(response.read().decode("utf8"))
8、HTTP认证
#!/usr/bin/env python3
# Example 8: HTTP basic authentication via a password manager and opener.
import urllib.request

# Create a password manager.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = "https://cms.tetx.com/"
password_mgr.add_password(None, top_level_url, 'yzhang', 'cccddd')
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
# Create an "opener" (OpenerDirector instance) that uses the auth handler.
opener = urllib.request.build_opener(handler)
# Use the opener directly to fetch a URL.
a_url = "https://cms.tetx.com/"
x = opener.open(a_url)
print(x.read())
# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)
a = urllib.request.urlopen(a_url).read().decode('utf8')
print(a)
9、使用代理
#!/usr/bin/env python3
# Example 9: route all requests through a proxy.
import urllib.request

# NOTE(review): 'sock5' is not a scheme key ProxyHandler understands --
# the keys are URL schemes such as 'http'/'https', and urllib has no
# built-in SOCKS support. Confirm the intended proxy type.
proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
# After install_opener, plain urlopen() calls go through the proxy opener.
urllib.request.install_opener(opener)
a = urllib.request.urlopen("http://g.cn").read().decode("utf8")
print(a)
10、超时
#!/usr/bin/env python3
# Example 10: set a process-wide default socket timeout for all requests.
import socket
import urllib.request

# Timeout in seconds.
timeout = 2
socket.setdefaulttimeout(timeout)
# This call to urllib.request.urlopen now uses the default timeout
# we have set in the socket module.
req = urllib.request.Request('http://twitter.com/')
a = urllib.request.urlopen(req).read()
print(a)
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。