
In today's data-driven world, web crawlers have become an essential way to collect information from the internet. Many sites, however, deploy strict anti-crawling defenses, and 12306 is a prime example, especially for dynamically loaded content. This article walks through using Selenium to simulate real browser behavior, work around these restrictions, and successfully scrape 12306's tour product data.
As China Railway's official ticketing platform, 12306 protects its tour product data with multiple layers of anti-crawling measures.
Selenium is a Web UI automation and testing tool, but its ability to drive a real browser makes it an effective counter to these anti-crawling strategies.
First, make sure the following Python libraries are installed:
```bash
pip install selenium beautifulsoup4 pandas webdriver-manager
```
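As an optional sanity check, you can confirm the packages import cleanly before moving on:

```python
# Optional: verify the installed packages import and report their versions.
import bs4
import pandas
import selenium

print(selenium.__version__, bs4.__version__, pandas.__version__)
```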
Selenium needs a driver matching your browser. Using webdriver-manager to download and manage the driver automatically is recommended:
```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--start-maximized')  # maximize the window
    
    # Let webdriver-manager download and manage ChromeDriver automatically
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Hide the WebDriver fingerprint: patch the current context, then register
    # a CDP script so every new document gets the same patch
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        '''
    })
    
    return driver
```
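A quick way to confirm the stealth patch took effect is to read `navigator.webdriver` after the driver starts (a minimal check, not a guarantee against every detection method):

```python
# Sanity check: with the patch applied, this prints None instead of True.
driver = setup_driver()
print(driver.execute_script("return navigator.webdriver"))
driver.quit()
```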
The 12306 tour product page (https://kyfw.12306.cn/otn/product/index.html) loads its content dynamically, so wait times have to be configured sensibly:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
def wait_for_element(driver, by, value, timeout=10):
    """等待元素出现"""
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, value))
        )
        return element
    except Exception:
        print(f"Timed out waiting for element: {value}")
        return None
```
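As a usage sketch (the element ID here is purely illustrative, not taken from the real page), a failed lookup degrades to `None` instead of raising:

```python
# "query-btn" is a hypothetical ID, used only to illustrate the call.
button = wait_for_element(driver, By.ID, "query-btn", timeout=5)
if button:
    button.click()
```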
```python
def slow_scroll(driver, scroll_pause_time=0.5):
    """Scroll the page slowly, mimicking human reading behavior."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        # Scroll a random distance to mimic human behavior
        scroll_height = random.randint(300, 800)
        driver.execute_script(f"window.scrollBy(0, {scroll_height});")
        time.sleep(scroll_pause_time)
        
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
```

When a captcha challenge appears, we need either manual intervention or an OCR service:
```python
def handle_verification(driver):
    """Handle a captcha challenge."""
    try:
        # Check whether a captcha has appeared
        captcha_element = wait_for_element(driver, By.ID, "code", timeout=5)
        if captcha_element:
            print("验证码出现,请手动解决...")
            # 暂停程序,等待用户手动处理验证码
            input("解决验证码后按回车键继续...")
            return True
    except Exception:
        pass
    return False
```
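If you would rather automate this step, one option is to make the solver pluggable. The sketch below is an assumption-heavy illustration: the `solve_captcha` callback is hypothetical (any OCR backend you wire in is your own choice), and it assumes the answer can be typed into the same element, which the real page may not allow:

```python
def handle_verification_auto(driver, solve_captcha=None):
    """Variant of handle_verification that accepts an optional OCR callback."""
    captcha_element = wait_for_element(driver, By.ID, "code", timeout=5)
    if not captcha_element:
        return False
    if solve_captcha is not None:
        # solve_captcha: user-supplied callable, PNG bytes -> answer string
        answer = solve_captcha(captcha_element.screenshot_as_png)
        captcha_element.send_keys(answer)
    else:
        input("Captcha detected - solve it in the browser, then press Enter...")
    return True
```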
The full implementation for extracting the tour product information:

```python
from bs4 import BeautifulSoup
import pandas as pd
import re
def extract_tour_products(driver):
    """提取旅游产品信息"""
    # 等待产品列表加载
    wait_for_element(driver, By.CLASS_NAME, "product-list", timeout=15)
    
    # Scroll slowly so all lazy-loaded content appears
    slow_scroll(driver)
    
    # Grab the page source and parse it with BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    products = []
    product_items = soup.find_all('div', class_='product-item')
    
    for item in product_items:
        try:
            # Extract the product name
            name_elem = item.find('div', class_='product-name')
            name = name_elem.text.strip() if name_elem else "Unknown product"
            
            # Extract the price
            price_elem = item.find('div', class_='product-price')
            price_text = price_elem.text.strip() if price_elem else "0"
            price = re.search(r'(\d+)', price_text.replace(',', ''))
            price = int(price.group(1)) if price else 0
            
            # Extract origin/destination
            route_elem = item.find('div', class_='product-route')
            route = route_elem.text.strip() if route_elem else ""
            
            # Extract the product detail link
            link_elem = item.find('a', href=True)
            link = "https://kyfw.12306.cn" + link_elem['href'] if link_elem else ""
            
            # Extract the product image
            img_elem = item.find('img', src=True)
            img_url = img_elem['src'] if img_elem else ""
            
            products.append({
                'name': name,
                'price': price,
                'route': route,
                'link': link,
                'image_url': img_url,
                'crawled_time': pd.Timestamp.now().isoformat()  # ISO string stays JSON-serializable
            })
            
        except Exception as e:
            print(f"提取产品信息时出错: {e}")
            continue
    
    return products
```
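During development it helps to spot-check the extractor before wiring up the full flow (a sketch that assumes `driver` is already on the product page):

```python
products = extract_tour_products(driver)
df = pd.DataFrame(products)
print(df.head())                   # eyeball the first few parsed rows
print(len(df), "products parsed")
```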
The complete script, integrating everything above (the helper functions from the previous sections are reused unchanged):

```python
import random
import json
from datetime import datetime
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
# Proxy details
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"
def get_proxy_url():
    """生成带认证的代理URL"""
    return f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"
def setup_driver():
    """设置带代理的浏览器实例"""
    chrome_options = Options()
    
    # Configure the proxy server (with credentials)
    # NOTE: Chrome ignores inline username:password in --proxy-server, so an
    # authenticated proxy normally needs a helper extension or a proxy-aware
    # tool; the flag is kept here as in the original setup.
    proxy_url = get_proxy_url()
    chrome_options.add_argument(f'--proxy-server={proxy_url}')
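    # A possible alternative for authenticated proxies (an assumption, not part
    # of the original setup): the third-party selenium-wire package handles
    # proxy authentication itself, roughly:
    #
    #   from seleniumwire import webdriver as wire_webdriver
    #   driver = wire_webdriver.Chrome(seleniumwire_options={
    #       'proxy': {'http': get_proxy_url(), 'https': get_proxy_url()}
    #   })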
    
    # Other settings
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('--start-maximized')
    
    # Let webdriver-manager download and manage ChromeDriver automatically
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Use a CDP command to hide the WebDriver fingerprint
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        '''
    })
    
    return driver
# wait_for_element, slow_scroll, handle_verification and extract_tour_products
# are identical to the versions defined in the sections above; include them
# here when assembling the script.
def crawl_12306_tours():
    """爬取12306旅游产品的完整流程"""
    print("启动浏览器(使用代理服务器)...")
    print(f"代理服务器: {proxyHost}:{proxyPort}")
    
    driver = setup_driver()
    
    try:
        # Open the 12306 tour product page
        print("Opening the 12306 tour product page...")
        driver.get("https://kyfw.12306.cn/otn/product/index.html")
        
        # Wait for the page to load
        time.sleep(3)
        
        # Check for and handle a captcha
        if handle_verification(driver):
            # Wait for the page to load again after the captcha
            time.sleep(3)
        
        # Extract product information
        print("Extracting tour product information...")
        products = extract_tour_products(driver)
        
        # Save the data
        if products:
            df = pd.DataFrame(products)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"12306_tours_{timestamp}.csv"
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"成功提取 {len(products)} 个旅游产品,已保存到 {filename}")
            
            # Also save a JSON copy
            json_filename = f"12306_tours_{timestamp}.json"
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(products, f, ensure_ascii=False, indent=2)
            print(f"JSON数据已保存到 {json_filename}")
        else:
            print("未找到旅游产品信息")
            
        return products
        
    except Exception as e:
        print(f"爬取过程中发生错误: {e}")
        return []
    
    finally:
        # Close the browser
        driver.quit()
        print("Browser closed")
# Run the crawler
if __name__ == "__main__":
    crawl_12306_tours()
```

When scraping 12306 data with Selenium, the ethical and legal constraints must be kept in mind: respect the site's terms of service, keep request rates low enough not to burden the servers, and use the collected data only for legitimate purposes such as personal study or research.
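As one concrete courtesy measure (a minimal sketch; the delay bounds are assumptions, not site policy), page loads can be wrapped in a throttle so the crawler never hammers the server:

```python
import random
import time

def polite_get(driver, url, min_delay=3.0, max_delay=8.0):
    """Load a URL, then sleep a random interval to keep the request rate low."""
    driver.get(url)
    # Randomized pause: keeps traffic modest and less machine-like.
    time.sleep(random.uniform(min_delay, max_delay))
```

Combined with the randomized scrolling shown earlier, this keeps the crawler's footprint closer to that of an ordinary visitor.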