现代反爬系统通过TLS指纹识别自动化工具,传统requests库的默认TLS指纹极易被检测拦截。解决方案:
# Use tls_client (third-party) to mimic a real browser's TLS fingerprint.
import tls_client
# NOTE(review): "chrome_120" must be a client identifier supported by the
# installed tls_client version — confirm before use.
session = tls_client.Session(
client_identifier="chrome_120",
random_tls_extension_order=True
)
response = session.get("https://target-site.com")
# Dynamically adjust the offered cipher suites (TLS 1.3 suites listed).
# NOTE(review): assigning `session.ciphers` is not part of every tls_client
# release — verify the attribute exists in the pinned version.
ciphers = [
'TLS_AES_128_GCM_SHA256',
'TLS_CHACHA20_POLY1305_SHA256',
'TLS_AES_256_GCM_SHA384'
]
session.ciphers = ciphers
使用hyper框架实现真正的HTTP/2请求,避免被基于HTTP版本的特征检测拦截:
# Issue the request over HTTP/2 using the third-party `hyper` library.
# NOTE(review): hyper is unmaintained and incompatible with modern Python —
# confirm it is still the intended dependency (httpx provides HTTP/2 today).
from hyper import HTTPConnection
conn = HTTPConnection('target-site.com:443', enable_push=True)
# NOTE(review): hyper derives the ':method'/':path'/':authority'/':scheme'
# pseudo-headers from the request line itself; passing them explicitly in
# `headers` is redundant — verify against the hyper API before relying on it.
conn.request('GET', '/api/data', headers={
':method': 'GET',
':path': '/api/data',
':authority': 'target-site.com',
':scheme': 'https',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36...'
})
resp = conn.get_response()
反爬系统通过检测navigator.webdriver等属性识别无头浏览器。解决方案:
# Inject patches when launching the browser with Playwright.
# NOTE(review): the body of the `with` block lost its indentation during
# extraction — every line below `with` must be re-indented before this runs.
from playwright.sync_api import sync_playwright
with sync_playwright() as p:
browser = p.chromium.launch(args=[
'--disable-blink-features=AutomationControlled',
'--use-fake-ui-for-media-stream'
])
page = browser.new_page()
# Override the navigator.webdriver property before any page script runs.
page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
window.chrome = undefined;
""")
page.goto('https://target-site.com')
通过修改Canvas渲染结果欺骗指纹检测:
// Executed in the browser: draw low-contrast noise onto a fresh canvas.
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
ctx.fillStyle = 'rgb(150, 150, 150)';
// Add random noise: sometimes paint a single 1x1 pixel at a random spot
// (an expression statement standing in for an `if`).
Math.floor(Math.random()*10) > 5 && ctx.fillRect(
Math.random()*10,
Math.random()*10,
1, 1
);
使用Charles+Proxifier抓包小程序,解密TLS流量:
# Build the MD5 request signature used by the mini-program API.
import hashlib

def get_wx_signature(params):
    """Return the upper-case MD5 signature for *params*.

    Keys are sorted, serialized as ``k=v`` pairs joined with ``&``, the
    secret key is appended as ``&key=...``, and the hex digest is upper-cased.
    """
    key = 'wx_secret_key_2024'
    pairs = (f"{name}={value}" for name, value in sorted(params.items()))
    raw = '&'.join(pairs)
    digest = hashlib.md5(f"{raw}&key={key}".encode()).hexdigest()
    return digest.upper()
使用Frida实时修改WASM内存数据:
// Injected Frida hook: log the argument of libencrypt.so's encrypt_data.
Interceptor.attach(Module.findExportByName("libencrypt.so", "encrypt_data"), {
onEnter: function(args) {
// Capture the first argument (read as a UTF-8 string) before the call.
this.plaintext = Memory.readUtf8String(args[0]);
},
onLeave: function(retval) {
console.log(`加密前: ${this.plaintext}`);
// NOTE(review): assumes retval points at >= 16 readable bytes — verify.
console.log(`加密后: ${Memory.readByteArray(retval, 16)}`);
}
});
# Scrapy downloader middleware: attach a region-appropriate proxy per request.
class SmartProxyMiddleware:
    """Assign every outgoing request a proxy chosen for its target region."""

    def __init__(self, proxy_pool):
        # Client/handle for the external proxy-pool API.
        self.proxy_pool = proxy_pool

    def process_request(self, request, spider):
        # Default to US exit nodes when the request does not name a region.
        target_region = request.meta.get('target_region', 'us')
        chosen = self.proxy_pool.get(target_region)
        request.meta['proxy'] = f"http://{chosen['ip']}:{chosen['port']}"
        request.headers['X-Proxy-Session'] = chosen['session_id']
# Track proxy status in Redis: the presence of a "ban:<proxy>" key marks a
# banned proxy.
# NOTE(review): the function body lost its indentation during extraction and
# the actual validation logic is elided (`...`) — this is an outline only.
import redis
r = redis.Redis()
def validate_proxy(proxy):
if r.get(f"ban:{proxy}") is None:
# Only probe proxies that are not currently flagged as banned.
...
# Client for a CNN-based captcha recognition microservice.
# NOTE(review): the function body lost its indentation during extraction;
# 'API_KEY' is a placeholder, and there is no timeout or error handling.
import requests
def solve_captcha(image_bytes):
resp = requests.post(
"http://captcha-service:8000/predict",
files={'image': image_bytes},
headers={'Authorization': 'Bearer API_KEY'}
)
return resp.json()['result']
# Scrapy pipeline sketch that routes captcha pages through the solver above.
from scrapy.pipelines.images import ImagesPipeline
class CaptchaPipeline(ImagesPipeline):
def captcha_required(self, response):
# Heuristic: treat any URL containing 'captcha' as a challenge page.
return 'captcha' in response.url
# NOTE(review): `response` is undefined inside process_item (its signature is
# (self, item, spider)) and FormRequest is never imported — this snippet does
# not run as written; indentation was also lost in extraction.
def process_item(self, item, spider):
if self.captcha_required(response):
img_data = response.body
captcha_text = solve_captcha(img_data)
return FormRequest.from_response(
response,
formdata={'captcha': captcha_text}
)
return item
# Data-sensitivity classifier: flags text that mentions PII-related keywords.
def data_classification(text):
    """Classify *text* as 'PII' when it contains a sensitive keyword, else 'Public'."""
    sensitive_keywords = ['身份证', '银行卡', '手机号']
    has_sensitive = any(keyword in text for keyword in sensitive_keywords)
    # 'PII' = personally identifiable information
    return 'PII' if has_sensitive else 'Public'
# Scrapy item pipeline that rejects items carrying personally identifiable data.
from scrapy.exceptions import DropItem

class DataFilterPipeline:
    """Drop any scraped item whose content is classified as PII."""

    def process_item(self, item, spider):
        # data_classification() is defined earlier in this file.
        classification = data_classification(item['content'])
        if classification == 'PII':
            raise DropItem("Contains sensitive information")
        return item
# Respect robots.txt: refuse to fetch URLs the site's crawl policy disallows.
import logging
from urllib.robotparser import RobotFileParser

logger = logging.getLogger(__name__)

# Module-level parser loaded once at import time (performs network I/O).
rp = RobotFileParser()
rp.set_url("https://target-site.com/robots.txt")
rp.read()

def is_allowed(url, user_agent='MyBot', parser=None):
    """Return True when *user_agent* may fetch *url* per robots.txt.

    parser: optional RobotFileParser; defaults to the module-level ``rp``.
            Passing one in makes the check testable without network access.
    """
    parser = rp if parser is None else parser
    if not parser.can_fetch(user_agent, url):
        # Original code called an undefined name ``log`` — use stdlib logging
        # with lazy %-formatting instead of an f-string.
        logger.warning("URL %s blocked by robots.txt", url)
        return False
    return True
使用GAN生成正常用户流量模式:
# Pseudo-code example: GAN training-loop sketch (not runnable as written).
# NOTE(review): build_generator, build_discriminator, noise, real_traffic,
# combined_model and batch_size are all undefined here, and loop-body
# indentation was lost in extraction.
generator = build_generator() # generator network
discriminator = build_discriminator() # discriminator network
for epoch in range(100):
# Produce generated (fake) samples from noise.
fake_traffic = generator.predict(noise)
# Discriminator step: label real samples 1 and generated samples 0.
d_loss = discriminator.train_on_batch(
x=real_traffic + fake_traffic,
y=[1]*len(real_traffic) + [0]*len(fake_traffic)
)
# Adversarial step: train the generator to push discriminator output to 1.
g_loss = combined_model.train_on_batch(
x=noise,
y=[1]*batch_size
)
将采集行为上链存证:
# Anchor a crawl-audit record on-chain via web3.py.
from web3 import Web3
w3 = Web3(Web3.HTTPProvider('https://mainnet.infura.io/v3/YOUR_KEY'))
contract_address = '0x123...' # placeholder contract address
contract_abi = [...] # ABI of the evidence-logging contract (placeholder)
# NOTE(review): the function body lost its indentation during extraction;
# transact() also assumes an unlocked account at w3.eth.accounts[0] — confirm.
def log_crawl_action(action_hash):
contract = w3.eth.contract(address=contract_address, abi=contract_abi)
tx_hash = contract.functions.logAction(action_hash).transact({
'from': w3.eth.accounts[0],
'gas': 100000
})
return tx_hash
# End-to-end workflow example (sketch).
# NOTE(review): scrapy, FormRequest, datetime, decrypt_price, solve_captcha and
# self.proxy_pool are not defined/imported in this snippet, and all class and
# method-body indentation was lost in extraction — illustrative only.
class EcommerceSpider(scrapy.Spider):
name = 'price_monitor'
def start_requests(self):
yield scrapy.Request(
url=self.proxy_pool.get_url('product_page'),
callback=self.parse_product,
meta={'proxy': True}
)
def parse_product(self, response):
# Pull the inline script element that carries the encrypted price.
price_script = response.xpath('//script[contains(.,"encryptedPrice")]/text()').get()
price = decrypt_price(price_script) # reverse-engineered decryption helper
# Divert to the anti-spam branch when a challenge marker is present.
if 'antispam' in response.text:
yield from self.handle_antispam(response)
yield {
'product': response.css('h1::text').get(),
'price': price,
'timestamp': datetime.now().isoformat()
}
def handle_antispam(self, response):
# Route the captcha image through the solver microservice, then re-submit.
captcha_img = response.css('#captcha img::attr(src)').get()
captcha_text = solve_captcha(captcha_img)
yield FormRequest.from_response(
response,
formdata={'captcha': captcha_text},
callback=self.parse_product
)
爬虫技术已进入"道高一尺,魔高一丈"的深度对抗阶段,开发者需掌握多层次的对抗与合规技术。建议构建持续学习体系,及时跟进相关技术的演进。
记住:技术是中立的,但应用必须有边界。合法合规的数据采集,才能让技术创造真正的商业价值与社会效益。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。