Cookie是服务器发送到用户浏览器并保存在本地的一小段数据,用于维持用户会话状态。爬虫在模拟登录后,通常需要携带Cookie访问后续页面。
Cookie通常带有`Expires`或`Max-Age`属性,超时后失效。如果爬虫未正确处理Cookie过期问题,会导致:

- 请求被拒绝(返回`401/403`状态码)。
- 请求被重定向(`302`重定向到登录页)。
- 返回登录提示(如"请先登录")。

import requests
def check_cookie_valid(session, test_url="https://example.com/user/profile",
                       marker="个人中心", timeout=10):
    """Return True when *session* can still open a login-protected page.

    The session issues a GET for ``test_url``; the cookie is considered valid
    only when the response status is 200 *and* ``marker`` appears in the body.

    Args:
        session: Object exposing ``get(url, timeout=...)`` whose result has
            ``status_code`` and ``text`` attributes (e.g. ``requests.Session``).
        test_url: Page that is only reachable while logged in.
        marker: Text expected in the page body for a logged-in user.
        timeout: Seconds passed to ``session.get`` so a stalled server cannot
            block the caller indefinitely.

    Returns:
        bool: True if the cookie is still accepted, False otherwise.
    """
    response = session.get(test_url, timeout=timeout)
    # Checking the body as well as the status guards against a 200 response
    # that does not actually contain the logged-in page.
    return response.status_code == 200 and marker in response.text
如果服务器返回的Cookie带有`Expires`字段,可以解析并判断是否已过期。
from datetime import datetime
def is_cookie_expired(cookie):
    """Return True when the cookie's ``expires`` value lies in the past.

    Args:
        cookie: Mapping describing a cookie. ``cookie["expires"]`` may be a
            GMT date string (``"Wed, 21-Oct-2015 07:28:00 GMT"``,
            ``"Wed, 21 Oct 2015 07:28:00 GMT"`` or
            ``"Wednesday, 21-Oct-15 07:28:00 GMT"``) or a numeric Unix
            timestamp in seconds.

    Returns:
        bool: True if the expiry time has passed; False if it has not, or if
        the cookie carries no expiry (a session cookie).

    Raises:
        ValueError: If ``expires`` is a string in none of the accepted formats.
    """
    from datetime import datetime, timezone

    if "expires" not in cookie or cookie["expires"] is None:
        return False  # no expiry recorded: session cookie

    expires = cookie["expires"]
    # The expiry is expressed in GMT, so compare against the current UTC time
    # rather than the machine's local clock.
    now = datetime.now(timezone.utc)

    if isinstance(expires, (int, float)):
        return datetime.fromtimestamp(expires, tz=timezone.utc) < now

    accepted_formats = (
        "%a, %d-%b-%Y %H:%M:%S GMT",  # dash-separated date, 4-digit year
        "%a, %d %b %Y %H:%M:%S GMT",  # space-separated date (RFC 1123 style)
        "%A, %d-%b-%y %H:%M:%S GMT",  # full weekday name, 2-digit year
    )
    for fmt in accepted_formats:
        try:
            parsed = datetime.strptime(expires, fmt)
        except ValueError:
            continue
        return parsed.replace(tzinfo=timezone.utc) < now
    raise ValueError(f"unrecognized cookie expires value: {expires!r}")
当检测到Cookie失效时,自动调用登录接口更新Cookie。
def login(username, password, timeout=10):
    """Log in with a username/password and return the authenticated session.

    Posts the credentials as form data to the login URL using a fresh
    ``requests.Session`` so that cookies set by the server are retained.

    Args:
        username: Account name sent as the ``username`` form field.
        password: Password sent as the ``password`` form field.
        timeout: Seconds to wait for the login request before giving up.

    Returns:
        requests.Session: Session carrying the cookies set during login.

    Raises:
        Exception: If the response body does not contain ``"登录成功"``.
    """
    login_url = "https://example.com/login"
    session = requests.Session()
    payload = {"username": username, "password": password}
    response = session.post(login_url, data=payload, timeout=timeout)
    if "登录成功" not in response.text:
        # Release the connection pool of a session nobody will use.
        session.close()
        raise Exception("登录失败")
    return session
`requests.Session()`可自动管理Cookie,但需结合存储机制(如文件、数据库)实现长期有效。
import pickle
def save_session(session, filename="session.pkl"):
    """Pickle the session's cookies to *filename*.

    Args:
        session: Object whose ``cookies`` attribute is picklable
            (e.g. ``requests.Session``).
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "wb") as f:
        pickle.dump(session.cookies, f)
def load_session(filename="session.pkl"):
    """Create a session and restore cookies previously pickled to *filename*.

    Args:
        filename: Path of the pickle file holding the saved cookies.

    Returns:
        requests.Session: A new session; its cookie jar is updated from the
        file when the file exists and can be unpickled, otherwise it is left
        empty.
    """
    session = requests.Session()
    try:
        with open(filename, "rb") as f:
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # cache files this program wrote itself.
            session.cookies.update(pickle.load(f))
    except FileNotFoundError:
        pass  # first run: nothing cached yet
    except (EOFError, pickle.UnpicklingError):
        # A truncated or corrupt cache is treated like a missing one so the
        # caller can simply log in again.
        pass
    return session
import redis
import pickle
# Module-level Redis client used by the Redis save/load helpers in this file;
# configured for host "localhost", port 6379, database 0.
redis_client = redis.StrictRedis(host="localhost", port=6379, db=0)
def save_session_to_redis(session, key="example_cookie"):
    """Store the session's pickled cookies in Redis under *key*.

    Args:
        session: Object whose ``cookies`` attribute is picklable
            (e.g. ``requests.Session``).
        key: Redis key to write; any existing value is replaced.
    """
    redis_client.set(key, pickle.dumps(session.cookies))
def load_session_from_redis(key="example_cookie"):
    """Create a session and restore cookies stored in Redis under *key*.

    Args:
        key: Redis key holding the pickled cookies.

    Returns:
        requests.Session: A new session; its cookie jar is updated from Redis
        when the key holds a non-empty value, otherwise it is left empty.
    """
    session = requests.Session()
    cookie_data = redis_client.get(key)
    if cookie_data:
        # NOTE(review): pickle.loads can execute arbitrary code; this is only
        # safe when every writer to this Redis key is trusted.
        session.cookies.update(pickle.loads(cookie_data))
    return session
某些网站采用JavaScript动态生成Cookie,可使用`selenium`模拟浏览器登录。
from selenium import webdriver
from selenium.webdriver.common.by import By
def selenium_login(username, password):
    """Log in through a real Chrome browser and return a requests session.

    Drives the login form with Selenium, then copies every browser cookie
    (name and value) into a new ``requests.Session``.

    Args:
        username: Text typed into the form field named ``username``.
        password: Text typed into the form field named ``password``.

    Returns:
        requests.Session: Session carrying the cookies read from the browser.
    """
    driver = webdriver.Chrome()
    try:
        driver.get("https://example.com/login")
        driver.find_element(By.NAME, "username").send_keys(username)
        driver.find_element(By.NAME, "password").send_keys(password)
        driver.find_element(By.XPATH, "//button[@type='submit']").click()
        # NOTE(review): cookies are read immediately after the click; if the
        # site sets them asynchronously an explicit wait may be needed.
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser down, even when a step above raises,
        # so no Chrome process is left running.
        driver.quit()

    # Convert the browser cookies into the form requests can use.
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(cookie["name"], cookie["value"])
    return session
避免因频繁登录触发反爬。
import requests
from requests.auth import HTTPProxyAuth
# Crawler configuration
LOGIN_URL = "https://example.com/login" # URL the login form is posted to
DATA_URL = "https://example.com/data" # URL of the data to crawl
USERNAME = "your_username" # account name
PASSWORD = "your_password" # account password
# Proxy configuration
# NOTE(review): proxy credentials are hard-coded in source; consider loading
# them from the environment or a secrets store instead.
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"
# Build the proxy URLs; the same HTTP proxy endpoint (with embedded
# credentials) is used for both http and https traffic.
proxies = {
"http": f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}",
"https": f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}",
}
# Request headers sent with every request
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
}
# Login helper
def login(timeout=10):
    """Log in through the configured proxy and return the new session.

    Posts ``USERNAME``/``PASSWORD`` to ``LOGIN_URL`` with the module-level
    ``headers`` and ``proxies``.

    Args:
        timeout: Seconds to wait for the login request before giving up.

    Returns:
        requests.Session | None: The session holding the new cookies when the
        server answers with status 200; None when the request fails or any
        other status is returned.
    """
    session = requests.Session()
    login_data = {
        "username": USERNAME,
        "password": PASSWORD,
    }
    try:
        # Proxy credentials are already embedded in the proxy URLs, which is
        # what every other request in this script relies on. HTTPProxyAuth is
        # therefore not passed here: it would add a Proxy-Authorization header
        # to the request itself rather than to the proxy handshake.
        response = session.post(
            LOGIN_URL,
            data=login_data,
            headers=headers,
            proxies=proxies,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # A network failure is reported the same way as a rejected login so
        # callers only have to handle the None result.
        session.close()
        print("登录失败", exc)
        return None

    if response.status_code == 200:
        print("登录成功,获取到新的 Cookie")
        return session

    session.close()
    print("登录失败")
    return None
# Detect whether the cookie has expired
def check_cookie(session):
    """Probe ``DATA_URL`` and report whether the session is still logged in.

    The cookie is treated as expired when the response status is 401 or 403,
    when the final URL (after redirects) is the login page, or when the body
    contains ``"登录已失效"``.

    Args:
        session: ``requests.Session`` whose cookies are being checked.

    Returns:
        bool: True if the cookie is still valid, False if a new login is
        required.
    """
    response = session.get(DATA_URL, headers=headers, proxies=proxies)
    rejected = response.status_code in (401, 403)
    # requests follows redirects by default, so a 302 to the login page shows
    # up as a final URL under LOGIN_URL rather than as a 302 status.
    sent_to_login = response.url.startswith(LOGIN_URL)
    if rejected or sent_to_login or "登录已失效" in response.text:
        print("Cookie 过期,需要重新登录")
        return False
    print("Cookie 仍然有效")
    return True
# Main crawler logic
def main(interval=5.0):
    """Run the crawl loop, logging in again whenever the cookie expires.

    Logs in once, then repeatedly checks the cookie; while it is valid the
    data page is fetched and printed, otherwise a fresh login is attempted.
    The loop ends only when a login attempt fails.

    Args:
        interval: Seconds to pause between iterations so the target site is
            not hit (or logged into) in a tight loop.
    """
    import time

    session = login()  # initial login to obtain the cookie
    if session is None:
        print("无法登录,爬虫终止")
        return

    while True:
        if check_cookie(session):
            # Cookie is valid: fetch the data.
            response = session.get(DATA_URL, headers=headers, proxies=proxies)
            if response.status_code == 200:
                print("成功获取数据")
                # Process the data.
                print(response.text)
            else:
                print("数据获取失败")
        else:
            # Cookie expired: drop the stale session and log in again.
            session.close()
            session = login()
            if session is None:
                print("重新登录失败,爬虫终止")
                break
        # Throttle every iteration, including re-logins, to avoid hammering
        # the server and triggering anti-crawling measures.
        time.sleep(interval)
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
- 通过`Expires`字段判断Cookie是否过期。
- 使用`Session`持久化存储Cookie。
- 使用`selenium`获取动态生成的Cookie。

通过合理管理Cookie,爬虫可以长期稳定运行,避免因登录失效导致的数据抓取中断。
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有