"在Python的世界里,urllib是通往互联网的万能钥匙。" —— Python社区格言
在Python生态系统中,当提到网络请求时,大多数人会立刻想到requests库。然而,Python标准库中的urllib模块却常常被忽视。作为Python内置的HTTP客户端工具集,urllib提供了强大而灵活的网络通信能力,无需额外依赖即可完成绝大多数网络操作。
Python 3中的urllib实际上是四个子模块的集合:

- urllib.request:打开和读取URL
- urllib.error:处理请求异常
- urllib.parse:解析URL
- urllib.robotparser:解析robots.txt文件
本文将深入探索urllib的各个方面,展示它如何成为Python开发者处理网络请求的瑞士军刀。
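在逐一展开之前,先用一个极简片段感受四个子模块的分工(仅作示意,假设网络可用,example.com为占位地址):

```python
from urllib import request, parse, error, robotparser

print(parse.urlparse("https://example.com/a?b=1").path)   # urllib.parse: 解析URL
rp = robotparser.RobotFileParser()                         # urllib.robotparser: 解析robots.txt
rp.set_url("https://example.com/robots.txt")

try:
    with request.urlopen("https://example.com") as resp:   # urllib.request: 发起请求
        print(resp.status)
except error.URLError as e:                                # urllib.error: 处理异常
    print("请求失败:", e.reason)
```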
urllib.request的核心是urlopen()函数,它可以处理大多数HTTP请求:
```python
from urllib.request import urlopen

# 基本GET请求
with urlopen('https://api.github.com') as response:
    print(f"状态码: {response.status}")
    print(f"响应头: {response.headers['content-type']}")
    data = response.read()
    print(f"内容长度: {len(data)} 字节")
```
通过Request对象,我们可以实现更复杂的请求:
```python
from urllib.request import Request, urlopen

# 创建自定义请求
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'application/json',
    'Authorization': 'Bearer YOUR_TOKEN'
}

req = Request(
    url='https://api.example.com/data',
    headers=headers,
    method='GET'
)

# 发送请求并处理响应
with urlopen(req) as res:
    if res.status == 200:
        print(res.read().decode('utf-8'))
```
发送表单数据或JSON内容同样简单:
```python
import json
from urllib.request import Request, urlopen
from urllib.parse import urlencode

# 表单数据POST
form_data = urlencode({
    'username': 'admin',
    'password': 'securepassword'
}).encode('utf-8')

req = Request(
    'https://example.com/login',
    data=form_data,
    method='POST'
)

# JSON数据POST
json_data = json.dumps({'title': 'New Post', 'content': 'Lorem ipsum'}).encode('utf-8')
json_req = Request(
    'https://api.example.com/posts',
    data=json_data,
    headers={'Content-Type': 'application/json'},
    method='POST'
)
```
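上面只是构造了请求对象。作为补充示意(接上例,api.example.com为占位地址),实际发送JSON请求并解析响应大致如下:

```python
import json
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    with urlopen(json_req) as resp:  # json_req 来自上例
        result = json.loads(resp.read().decode('utf-8'))
        print("创建成功:", result)
except HTTPError as e:
    print("请求失败:", e.code, e.reason)
```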
URL解析是网络编程中的基础操作,urllib.parse提供了强大的工具集。
```python
from urllib.parse import urlparse, urlunparse

# 解析URL
url = "https://www.example.com:8080/path/to/page?query=python#section"
parsed = urlparse(url)

print(f"协议: {parsed.scheme}")
print(f"域名: {parsed.hostname}")
print(f"端口: {parsed.port}")
print(f"路径: {parsed.path}")
print(f"查询参数: {parsed.query}")
print(f"锚点: {parsed.fragment}")

# 构建URL
new_url = urlunparse((
    'https',
    'subdomain.example.com',
    '/new/path',
    '',
    'category=books&lang=en',
    'chapter3'
))
print(f"构建的URL: {new_url}")
```
```python
from urllib.parse import parse_qs, parse_qsl, urlencode

# 解析查询字符串
query_str = "name=John&age=30&skills=python&skills=javascript"
params_dict = parse_qs(query_str)
params_list = parse_qsl(query_str)

print("解析为字典:", params_dict)
print("解析为列表:", params_list)

# 构建查询字符串
new_params = urlencode({
    'search': 'urllib tutorial',
    'page': 2,
    'lang': ['en', 'fr']
}, doseq=True)
print("编码的查询字符串:", new_params)
```
网络请求充满不确定性,urllib.error提供了专业的异常处理机制。
```python
import socket
from urllib.error import URLError, HTTPError
from urllib.request import urlopen

try:
    response = urlopen("https://example.com/non-existent-page")
except HTTPError as e:
    print(f"HTTP错误 {e.code}: {e.reason}")
    if e.code == 404:
        print("页面不存在!")
    elif e.code == 500:
        print("服务器内部错误")
except URLError as e:
    print(f"URL错误: {e.reason}")
    if isinstance(e.reason, socket.timeout):
        print("连接超时")
except Exception as e:
    print(f"其他错误: {type(e).__name__}: {e}")
```
对于超时这类暂时性故障,可以封装一个带重试机制的请求函数:

```python
import socket
from urllib.error import URLError
from urllib.request import urlopen

def safe_urlopen(url, timeout=10, retries=3):
    for attempt in range(retries):
        try:
            with urlopen(url, timeout=timeout) as response:
                return response.read()
        except socket.timeout:
            print(f"超时重试 {attempt+1}/{retries}")
        except URLError as e:
            if isinstance(e.reason, socket.timeout):
                print(f"超时重试 {attempt+1}/{retries}")
            else:
                raise
    raise TimeoutError(f"在{retries}次重试后请求失败")

# 使用带重试的请求函数
html_content = safe_urlopen("https://example.com", timeout=2, retries=5)
```
当需要通过代理访问网络时,可以用ProxyHandler构建自定义opener:

```python
from urllib.request import ProxyHandler, build_opener, install_opener, urlopen

# 配置代理
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8081',
}

proxy_handler = ProxyHandler(proxies)
opener = build_opener(proxy_handler)
install_opener(opener)  # 全局安装

# 现在所有请求都通过代理
with urlopen("https://www.example.com") as response:
    print("通过代理访问成功")
```
urllib同样可以配合http.cookiejar维持会话Cookie,实现登录后的持续访问:

```python
import http.cookiejar
from urllib.request import HTTPCookieProcessor, Request, build_opener
from urllib.parse import urlencode

# 创建Cookie处理器
cookie_jar = http.cookiejar.CookieJar()
cookie_handler = HTTPCookieProcessor(cookie_jar)

# 创建自定义opener
opener = build_opener(cookie_handler)

# 登录请求
login_data = urlencode({
    'username': 'user',
    'password': 'pass'
}).encode('utf-8')

login_req = Request(
    'https://example.com/login',
    data=login_data,
    method='POST'
)

# 发送登录请求
opener.open(login_req)

# 访问需要认证的页面
profile_req = Request('https://example.com/profile')
with opener.open(profile_req) as res:
    print("用户资料:", res.read().decode())
```
利用Range请求头,还可以实现支持断点续传的文件下载:

```python
import os
from urllib.request import Request, urlopen

def download_file(url, file_path, chunk_size=8192):
    # 获取文件信息
    with urlopen(url) as response:
        file_size = int(response.headers['Content-Length'])
        print(f"文件大小: {file_size/1024/1024:.2f} MB")

    # 检查本地文件
    resume_position = 0
    if os.path.exists(file_path):
        resume_position = os.path.getsize(file_path)
        print(f"发现已下载部分: {resume_position} 字节")

    # 断点续传
    headers = {'Range': f'bytes={resume_position}-'} if resume_position else {}
    req = Request(url, headers=headers)

    with urlopen(req) as res, open(file_path, 'ab') as f:
        while True:
            chunk = res.read(chunk_size)
            if not chunk:
                break
            f.write(chunk)
            downloaded = os.path.getsize(file_path)
            progress = downloaded / file_size * 100
            print(f"\r下载进度: {progress:.1f}%", end='')

    print(f"\n下载完成: {file_path}")

# 使用示例
download_file(
    "https://example.com/large-file.zip",
    "large-file.zip"
)
```
在编写网络爬虫时,遵守robots.txt协议是基本的网络礼仪。
```python
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse

def can_fetch_url(url, user_agent='*'):
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"

    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()

    return rp.can_fetch(user_agent, url)

# 检查爬取权限
url_to_check = "https://example.com/private/page.html"
if can_fetch_url(url_to_check, 'MyBot'):
    print("允许爬取")
else:
    print("被robots.txt禁止爬取")
```
在此基础上,可以封装一个既检查抓取权限又遵守Crawl-delay的“礼貌爬虫”:

```python
import time
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen
from urllib.parse import urlparse

class PoliteCrawler:
    def __init__(self, user_agent='MyCrawler'):
        self.user_agent = user_agent
        self.rp_cache = {}
        self.last_fetch = {}

    def can_fetch(self, url):
        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # 获取或创建RobotFileParser
        if base_url not in self.rp_cache:
            rp = RobotFileParser()
            rp.set_url(f"{base_url}/robots.txt")
            try:
                rp.read()
                self.rp_cache[base_url] = rp
            except Exception:
                # 无法读取robots.txt时默认允许
                return True

        # 检查爬取权限
        return self.rp_cache[base_url].can_fetch(self.user_agent, url)

    def fetch(self, url):
        if not self.can_fetch(url):
            raise PermissionError(f"被robots.txt禁止: {url}")

        # 遵守Crawl-delay
        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"
        if base_url in self.last_fetch:
            rp = self.rp_cache.get(base_url)  # robots.txt不可读时可能没有缓存
            delay = (rp.crawl_delay(self.user_agent) if rp else None) or 1
            elapsed = time.time() - self.last_fetch[base_url]
            if elapsed < delay:
                time.sleep(delay - elapsed)

        # 发送请求
        response = urlopen(url)
        self.last_fetch[base_url] = time.time()
        return response.read()

# 使用示例
crawler = PoliteCrawler()
content = crawler.fetch("https://example.com/public/page.html")
```
让我们使用urllib创建一个GitHub API客户端:
```python
import json
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from urllib.parse import urlencode, urljoin

BASE_URL = "https://api.github.com/"

class GitHubAPI:
    def __init__(self, token=None):
        self.token = token
        self.headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'GitHubAPI/1.0'
        }
        if token:
            self.headers['Authorization'] = f'token {token}'

    def _make_request(self, endpoint, method='GET', params=None, data=None):
        url = urljoin(BASE_URL, endpoint)

        # 处理查询参数
        if params:
            url += '?' + urlencode(params)

        # 准备请求数据(复制headers,避免污染实例级默认值)
        headers = dict(self.headers)
        json_data = None
        if data:
            json_data = json.dumps(data).encode('utf-8')
            headers['Content-Type'] = 'application/json'

        req = Request(
            url,
            data=json_data,
            headers=headers,
            method=method
        )

        try:
            with urlopen(req) as response:
                return json.loads(response.read().decode('utf-8'))
        except HTTPError as e:
            error_msg = json.loads(e.read().decode('utf-8'))
            raise APIException(e.code, error_msg.get('message', 'Unknown error'))

    def get_user(self, username):
        return self._make_request(f'users/{username}')

    def list_repos(self, username, sort='created', direction='desc'):
        params = {'sort': sort, 'direction': direction}
        return self._make_request(f'users/{username}/repos', params=params)

    def create_repo(self, name, description="", private=False):
        data = {
            'name': name,
            'description': description,
            'private': private
        }
        return self._make_request('user/repos', method='POST', data=data)

    def search_repos(self, query, language=None):
        params = {'q': query}
        if language:
            params['q'] += f' language:{language}'
        return self._make_request('search/repositories', params=params)

class APIException(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(f"{status_code}: {message}")

# 使用示例
if __name__ == "__main__":
    # 创建API实例(使用自己的GitHub token)
    api = GitHubAPI(token="YOUR_GITHUB_TOKEN")

    # 获取用户信息
    user = api.get_user("torvalds")
    print(f"用户: {user['login']}, 创建于: {user['created_at']}")

    # 搜索仓库
    repos = api.search_repos("machine learning", language="python")
    print(f"找到 {repos['total_count']} 个Python机器学习仓库")

    # 创建新仓库(需要认证)
    try:
        new_repo = api.create_repo("my-new-project", "使用urllib创建的仓库")
        print(f"仓库创建成功: {new_repo['html_url']}")
    except APIException as e:
        print(f"创建仓库失败: {e}")
```
在更底层,urllib允许通过自定义Handler干预连接的建立过程。下面是一个按主机缓存连接的简化示意(注意:urllib的默认处理流程会在请求完成后接管底层socket,因此这种写法并不能实现真正的连接复用,生产环境建议直接使用http.client管理连接或选用第三方库):

```python
from http.client import HTTPConnection
from urllib.request import HTTPHandler, build_opener, install_opener, urlopen

class PoolManager(HTTPHandler):
    _connection_pool = {}

    def http_open(self, req):
        # do_open会把get_connection当作"连接工厂"调用
        return self.do_open(self.get_connection, req)

    def get_connection(self, host, timeout=10):
        key = (host, timeout)
        if key not in self._connection_pool:
            self._connection_pool[key] = HTTPConnection(host, timeout=timeout)
        return self._connection_pool[key]

# 安装自定义处理器
opener = build_opener(PoolManager())
install_opener(opener)

# 该Handler只接管HTTP请求(HTTPS需另行实现https_open)
urlopen("http://example.com")
```
当需要批量下载时,可以用线程和队列组合出一个简单的并行下载器:

```python
import threading
import queue
from urllib.request import urlopen

class DownloadWorker(threading.Thread):
    def __init__(self, task_queue):
        super().__init__()
        self.task_queue = task_queue

    def run(self):
        while True:
            url, save_path = self.task_queue.get()
            if url is None:
                break
            try:
                with urlopen(url) as response, open(save_path, 'wb') as f:
                    while True:
                        chunk = response.read(8192)
                        if not chunk:
                            break
                        f.write(chunk)
                print(f"下载完成: {save_path}")
            except Exception as e:
                print(f"下载失败 {url}: {e}")
            finally:
                self.task_queue.task_done()

class ParallelDownloader:
    def __init__(self, num_workers=4):
        self.task_queue = queue.Queue()
        self.workers = []
        for _ in range(num_workers):
            worker = DownloadWorker(self.task_queue)
            worker.start()
            self.workers.append(worker)

    def add_download(self, url, save_path):
        self.task_queue.put((url, save_path))

    def wait_completion(self):
        self.task_queue.join()
        # 停止工作线程
        for _ in range(len(self.workers)):
            self.task_queue.put((None, None))
        for worker in self.workers:
            worker.join()

# 使用示例
downloader = ParallelDownloader(num_workers=5)

files = [
    ("https://example.com/file1.zip", "file1.zip"),
    ("https://example.com/file2.zip", "file2.zip"),
    ("https://example.com/large/video.mp4", "video.mp4"),
    # 添加更多文件...
]

for url, path in files:
    downloader.add_download(url, path)

downloader.wait_completion()
```
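作为设计上的另一种选择,标准库的concurrent.futures可以用更少的代码达到类似效果。下面是一个思路示意(URL仅为占位):

```python
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

def fetch(url):
    with urlopen(url) as resp:
        return url, len(resp.read())

urls = ["https://example.com", "https://www.python.org"]

# 线程池并发执行fetch,map按提交顺序返回结果
with ThreadPoolExecutor(max_workers=4) as pool:
    for url, size in pool.map(fetch, urls):
        print(f"{url}: {size} 字节")
```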
那么,urllib与requests应该如何取舍?下表给出一个简要对比:

| 特性 | urllib | requests |
| --- | --- | --- |
| 安装要求 | Python标准库 | 需要pip安装 |
| 学习曲线 | 较陡峭 | 平缓 |
| API设计 | 底层、灵活 | 高级、简洁 |
| 功能完整性 | 基础HTTP功能 | 更丰富的功能集 |
| 性能 | 更高(无额外开销) | 略低(存在抽象层) |
| 社区支持 | 标准库文档 | 丰富的社区资源 |
推荐选择原则:

- 编写无外部依赖的脚本、工具,或需要精确控制HTTP细节时,优先使用urllib;
- 追求开发效率、需要更丰富功能集的常规业务项目,优先使用requests。
HTTPS通信的安全性同样不可忽视,建议显式创建并使用经过校验的SSL上下文:

```python
import ssl
from urllib.request import urlopen

# 创建安全上下文(默认即开启主机名校验与证书校验,这里显式写出以示强调)
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = True
ssl_context.verify_mode = ssl.CERT_REQUIRED

# 安全请求
try:
    with urlopen("https://secure.example.com", context=ssl_context) as response:
        # 处理安全连接
        pass
except ssl.SSLCertVerificationError as e:
    print(f"证书验证失败: {e}")
```
最后,为请求设置超时并限制重定向次数,可以避免程序在异常站点上无限等待:

```python
import socket
from urllib.request import urlopen
from urllib.error import URLError
from urllib.parse import urljoin

def safe_urlopen(url, timeout=10, max_redirects=5):
    # 注:urlopen默认会自动跟随重定向,这里显式处理是为了限制重定向次数
    redirects = 0
    while redirects <= max_redirects:
        try:
            with urlopen(url, timeout=timeout) as response:
                if 300 <= response.status < 400:
                    # Location可能是相对路径,需基于当前URL拼接
                    url = urljoin(url, response.headers['Location'])
                    redirects += 1
                    continue
                return response.read()  # 在连接关闭前读取内容
        except socket.timeout:
            raise TimeoutError(f"连接超时: {url}")
        except URLError as e:
            if isinstance(e.reason, socket.timeout):
                raise TimeoutError(f"连接超时: {url}")
            raise
    raise RuntimeError(f"重定向超过{max_redirects}次")

# 使用安全请求
content = safe_urlopen("https://example.com")
```
urllib作为Python标准库中的网络工具集,提供了从基础到高级的HTTP通信能力。通过本文的探索,我们了解了:

- 用urllib.request发起GET/POST请求、定制请求头,以及代理、Cookie等自定义处理器;
- 用urllib.parse解析与构建URL、处理查询字符串;
- 用urllib.error区分HTTPError与URLError,实现健壮的异常处理;
- 用urllib.robotparser遵守robots.txt,编写“礼貌”的爬虫;
- 以及断点续传、并发下载、SSL校验等实战技巧。
虽然requests库因其简洁API广受欢迎,但urllib仍然是Python开发者的重要工具,尤其是在需要避免外部依赖或深入理解HTTP协议底层运作时。
正如Python之禅所说:"显式优于隐式"。使用urllib可能需要更多代码,但它提供了对HTTP通信过程的精确控制,这是理解网络编程本质的宝贵机会。