Using urllib

# Using urllib
# Sending requests
#1. urlopen
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
#print(response.read().decode('utf-8'))
# Use type() to print the type of the response
print(type(response))
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
# Passing parameters
#urllib.request.urlopen(url, data=None, [timeout,]*, cafile=None, capath=None, cadefault=False, context=None)
# The data parameter is optional; to add it, use the bytes method to convert it into byte-stream-encoded content, i.e. the bytes type
# Also, once this parameter is passed, the request method is no longer GET but POST
# The site requested here is httpbin.org, which provides HTTP request testing
import urllib.parse

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post',data=data)
print(response.read())
b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.12", \n    "X-Amzn-Trace-Id": "Root=1-67c1704f-4eb3a4ac2b4e81cb47c01cfc"\n  }, \n  "json": null, \n  "origin": "154.40.60.12", \n  "url": "http://httpbin.org/post"\n}\n'
# Set a timeout so that a page that does not respond for a long time is skipped during crawling
import socket   
import urllib.error  

try:  
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)  
except urllib.error.URLError as e:  
    if isinstance(e.reason, socket.timeout):  
        print('TIME OUT')
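# Building on this timeout pattern, pages that repeatedly time out can be skipped with a small retry helper; a minimal illustrative sketch (the helper name and retry count are my own, not from the original):
def fetch_with_retry(url, retries=3, timeout=1):
    # Try the URL up to `retries` times, treating a timeout as a soft failure
    for attempt in range(retries):
        try:
            return urllib.request.urlopen(url, timeout=timeout)
        except urllib.error.URLError as e:
            if isinstance(e.reason, socket.timeout):
                print(f'TIME OUT, attempt {attempt + 1}')
            else:
                raise
    return None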
     
#2. Request
#class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
# url is the only required parameter; data must be passed as the bytes type; headers can also be added by calling add_header() on the Request instance
import urllib.request  

request = urllib.request.Request('https://python.org')  
response = urllib.request.urlopen(request)  
print(response.read().decode('utf-8'))
from urllib import parse, request

url = 'http://httpbin.org/post'
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)', 'Host': 'httpbin.org'}
params = {'name': 'Germey'}  # renamed from dict to avoid shadowing the built-in
data = bytes(parse.urlencode(params), encoding='utf8')
req = request.Request(url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Germey"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "11", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)", 
    "X-Amzn-Trace-Id": "Root=1-67c17458-6ec1e2f400a80e126995c7e6"
  }, 
  "json": null, 
  "origin": "154.40.60.12", 
  "url": "http://httpbin.org/post"
}
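# As noted above, headers can also be attached after constructing the Request via the instance's add_header() method; a minimal sketch equivalent to the example just shown:
req = request.Request(url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.status)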

#3. Advanced usage
# There are various Handlers: some handle login authentication, some handle Cookies, and some handle proxy settings
# The BaseHandler class in the urllib.request module is the parent of all other Handlers; it provides the most basic methods, such as default_open and protocol_request
# Another important class is OpenerDirector, which we may simply call an Opener; in short, Handlers are used to build Openers
# HTTPBasicAuthHandler manages authentication: if a link requires authentication when opened, this handler can solve the authentication problem
from urllib.request import HTTPPasswordMgrWithDefaultRealm,HTTPBasicAuthHandler,build_opener
from urllib.error import URLError

username = 'admin'
password = '*******'
url = 'http://127.0.0.1:5244/'

p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)  # the password manager must be passed to the handler
opener = build_opener(auth_handler)
try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
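# The opener can also be installed globally with install_opener, after which plain urlopen calls go through it as well:
import urllib.request
urllib.request.install_opener(opener)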
# Proxies, using Kuaidaili as an example
from urllib.request import ProxyHandler, build_opener
import requests

api = "https://dps.kdlapi.com/api/getdps"

# Request parameters
params = {
    "secret_id": "your id",
    "signature": "your signature",
    "num": 1,   # number of IPs to extract
}
# Get the response content
response = requests.get(api, params=params)
# Parse out the proxy IP
proxy_ip = response.text.strip()  # strip extra whitespace and newlines
# Check whether a valid proxy IP was obtained; exit early instead of using an undefined proxy below
if not proxy_ip:
    raise SystemExit("No valid proxy IP obtained")
# Define the proxy
username = "*******"  # replace with your username
password = "*******"  # replace with your password
proxy = f"https://{username}:{password}@{proxy_ip}"
print(f"Obtained proxy IP: {proxy_ip}")
# Create a ProxyHandler
proxy_handler = ProxyHandler({'http': proxy, 'https': proxy})

# Create an opener
opener = build_opener(proxy_handler)
try:
    response = opener.open('https://www.baidu.com') 
    print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)
Obtained proxy IP: 218.95.37.135:40358
<html>
<head>
	<script>
		location.replace(location.href.replace("https://","http://"));
	</script>
</head>
<body>
	<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>
#Cookies
# Retrieve a website's Cookies
import http.cookiejar,urllib.request

# Declare a CookieJar object
cookie = http.cookiejar.CookieJar()
# Use HTTPCookieProcessor to build a Handler
handler = urllib.request.HTTPCookieProcessor(cookie)
# Build the Opener with build_opener and call its open method
opener = urllib.request.build_opener(handler)
response = opener.open('https://www.baidu.com')
for item in cookie:
    print(item.name+"="+item.value)
BD_NOT_HTTPS=1
BIDUPSID=BFE0F5D5293A45F6AEC0BA9BA07B81DA
PSTM=1740734698
BAIDUID=BFE0F5D5293A45F6457A648A94C15082:FG=1
# Save the Cookies as a text file
filename = 'cookies.txt'
# The CookieJar then needs to be replaced with MozillaCookieJar, which handles Cookies and file-related events such as reading and saving
cookie = http.cookiejar.MozillaCookieJar(filename)  
handler = urllib.request.HTTPCookieProcessor(cookie)  
opener = urllib.request.build_opener(handler)  
response = opener.open('http://www.baidu.com')  
cookie.save(ignore_discard=True, ignore_expires=True)
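# The saved file can later be read back with MozillaCookieJar's load method and reused for new requests; a minimal sketch:
cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookies.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.status)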
# Handling exceptions
#1. URLError: the base exception class of the urllib.error module; it has a reason attribute that gives the cause of the error
from urllib import request,error
try:
    response = request.urlopen('https://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)
Not Found
#2. HTTPError
# A subclass of URLError, specialized for handling HTTP request errors; it has three attributes: code, reason, and headers
try:
    response = request.urlopen('https://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers,sep='\n')
# Since URLError is the parent class of HTTPError, it is better to catch the subclass's error first and then the parent class's
try:  
    response = request.urlopen('https://cuiqingcai.com/index.htm')  
except error.HTTPError as e:  
    print(e.reason, e.code, e.headers, sep='\n')  
except error.URLError as e:  
    print(e.reason)  
else:  
    print('Request Successfully')
# Parsing links
# urllib.parse defines the standard interface for handling URLs, e.g. extracting, merging, and converting the various parts of a link
#1. urlparse
#urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result),result)
# Standard URL format: scheme://netloc/path;params?query#fragment
<class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
# ParseResult is in fact a tuple, so its parts can be retrieved by index order or by attribute name
result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)  
print(result.scheme, result[0], result.netloc, result[1], sep='\n')
http
http
www.baidu.com
www.baidu.com
#2. urlunparse
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
http://www.baidu.com/index.html;user?a=6#comment
#3. urlsplit
# Very similar to urlparse, except it does not parse params separately and returns only 5 results; params is merged into path
from urllib.parse import urlsplit  

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')  
print(result)
SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')
#4. urlunsplit
from urllib.parse import urlunsplit  

data = ['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']  
print(urlunsplit(data))
http://www.baidu.com/index.html?a=6#comment
#5. urljoin
#urlunparse and urlunsplit can merge links, but only from an object of the required length, with every part of the link clearly separated
#urljoin instead takes a base_url plus a new link: it analyzes the base_url's scheme, netloc, and path, supplies whichever of these the new link is missing, and returns the result; the base_url's params, query, and fragment play no role
from urllib.parse import urljoin  

print(urljoin('http://www.baidu.com', 'FAQ.html'))  
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))  
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))  
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))  
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))  
print(urljoin('http://www.baidu.com', '?category=2#comment'))  
print(urljoin('www.baidu.com', '?category=2#comment'))  
print(urljoin('www.baidu.com#comment', '?category=2'))
http://www.baidu.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
https://cuiqingcai.com/index.php
http://www.baidu.com?category=2#comment
www.baidu.com?category=2#comment
www.baidu.com?category=2
#6. urlencode: constructs GET request parameters
from urllib.parse import urlencode

params = {
    'name':'germey',
    'age':22
}
base_url = 'http://baidu.com?'
url = base_url + urlencode(params)
print(url)
http://baidu.com?name=germey&age=22
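# When a key carries multiple values, urlencode can serialize sequences as repeated parameters via doseq=True:
print(urlencode({'name': 'germey', 'tag': ['python', 'spider']}, doseq=True))
name=germey&tag=python&tag=spider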
#7. parse_qs: deserialization, turning GET request parameters back into a dictionary
from urllib.parse import parse_qs  

query = 'name=germey&age=22'  
print(parse_qs(query))
{'name': ['germey'], 'age': ['22']}

#8. parse_qsl: converts the parameters into a list of tuples
from urllib.parse import parse_qsl  

query = 'name=germey&age=22'  
print(parse_qsl(query))
[('name', 'germey'), ('age', '22')]
#9. quote: converts content such as Chinese characters into URL encoding
from urllib.parse import quote  

keyword = '壁紙'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)
https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8

#10. unquote: performs URL decoding
from urllib.parse import unquote  

url = 'https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'  
print(unquote(url))
https://www.baidu.com/s?wd=壁紙
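# Note that quote leaves '/' unescaped by default; pass safe='' to percent-encode it as well:
print(quote('/s?wd=壁紙'))           # '/' is kept by default
print(quote('/s?wd=壁紙', safe=''))  # '/' is percent-encoded too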
# Analyzing the Robots protocol
# robotparser: urllib's module for parsing robots.txt and deciding which pages a crawler may fetch
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url('http://www.jianshu.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))
False
False
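# The parser can also be fed robots.txt content fetched by other means via the parse() method; a minimal sketch (illustrative, since the site may reject urllib's default User-Agent):
from urllib.request import urlopen

rp = RobotFileParser()
rp.parse(urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))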