网络爬虫识别并处理页面重定向,常见的方式有以下几种:
# Approach 1: let requests follow redirects automatically.
import requests

url = "http://example.com/old_page"
# requests follows redirects by default for GET, so `response` is the final
# page after any redirect chain. The timeout keeps a stalled server from
# blocking the crawler forever (requests applies no timeout by default).
response = requests.get(url, timeout=10)
print(response.text)
# Approach 2: disable automatic redirects and handle a single hop manually.
from urllib.parse import urljoin

import requests

# Every HTTP status that redirects via a Location header, not just 301/302.
REDIRECT_STATUSES = (301, 302, 303, 307, 308)

url = "http://example.com/old_page"
response = requests.get(url, allow_redirects=False, timeout=10)
# A redirect status without a Location header cannot be followed, so treat it
# like an ordinary response instead of raising KeyError.
location = response.headers.get("Location")
if response.status_code in REDIRECT_STATUSES and location is not None:
    # Location may be relative (e.g. "/new_page"); resolve it against the URL
    # that was actually requested before using it.
    redirect_url = urljoin(response.url, location)
    print(f"页面将重定向到: {redirect_url}")
    # Optionally continue by requesting the redirect target.
    new_response = requests.get(redirect_url, timeout=10)
    print(new_response.text)
else:
    print(response.text)
# Approach 3: follow a redirect chain manually, one hop at a time.
from urllib.parse import urljoin

import requests

# Upper bound on hops so a redirect cycle (A -> B -> A) cannot loop forever;
# 30 matches the default limit requests uses when it follows redirects itself.
MAX_REDIRECTS = 30
# Every HTTP status that redirects via a Location header, not just 301/302.
REDIRECT_STATUSES = (301, 302, 303, 307, 308)

url = "http://example.com/start_page"
for _ in range(MAX_REDIRECTS + 1):
    response = requests.get(url, allow_redirects=False, timeout=10)
    location = response.headers.get("Location")
    # Stop at the first response that is not a followable redirect.
    if response.status_code not in REDIRECT_STATUSES or location is None:
        break
    # Location may be relative; resolve it against the URL just requested.
    url = urljoin(response.url, location)
else:
    # The loop ran out of hops without reaching a non-redirect response.
    raise requests.exceptions.TooManyRedirects(
        f"Exceeded {MAX_REDIRECTS} redirects; last target: {url}"
    )
print(response.text)
# Approach 4: follow redirects automatically, then inspect the chain afterwards.
import requests

url = "http://example.com/old_page"
# The timeout keeps a stalled server from blocking the crawler forever
# (requests applies no timeout by default).
response = requests.get(url, timeout=10)
# response.history lists the intermediate redirect responses that led to the
# final one; it is empty when no redirect happened.
if response.history:
    print("发生了重定向,重定向历史如下:")
    for resp in response.history:
        print(f"状态码: {resp.status_code}, URL: {resp.url}")
# The final status and body are reported whether or not a redirect occurred.
print(f"最终响应状态码: {response.status_code}")
print(response.text)