无缝调用Java生态：用Jsoup轻松解析网页

原创

华科云商小徐

发布于 2025-09-03 11:20:30

1930

大家是不是好奇用Groovy写爬虫到底靠不靠谱？简单说，这玩意儿就像给Java插上了翅膀——既能直接用所有Java的牛逼库，又能用更简洁的代码快速搞事。比如用几行代码就能扒网页数据，处理JSON像吃糖一样简单，还能和Redis这些中间件无缝配合。特别适合急需验证想法或者团队里Java老哥多的场景，下面咱就上手整段真实能跑的代码瞧瞧！

下面是一段实用的Groovy爬虫代码，用于爬取图书信息网站并提取数据。这个示例展示了Groovy在爬虫开发中的简洁性和强大功能。

#!/usr/bin/env groovy

// 导入必要的库
@Grab('org.jsoup:jsoup:1.15.3')
@Grab('org.apache.httpcomponents:httpclient:4.5.13')
@Grab('com.fasterxml.jackson.core:jackson-databind:2.14.2')

import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import org.apache.http.impl.client.HttpClients
import org.apache.http.client.methods.HttpGet
import org.apache.http.client.config.RequestConfig
import org.apache.http.util.EntityUtils
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.SerializationFeature

/**
 * Plain data holder for one scraped book.
 *
 * Fields are Groovy properties, so getters and setters are generated
 * automatically.
 */
class Book {
    String title
    String author
    BigDecimal price
    String description
    Integer rating
    String url

    /**
     * Builds a one-line, human-readable summary with title, author, price and rating.
     *
     * @return the summary line; unset fields are rendered as "null"
     */
    @Override
    String toString() {
        def summary = new StringBuilder()
        summary << '《' << title << '》- ' << author
        summary << ' - 价格: ' << price << '元'
        summary << ' - 评分: ' << rating << '星'
        return summary.toString()
    }
}

/**
 * Scrapes a book catalogue page: downloads HTML over HTTP, extracts one
 * {@link Book} per listing entry with Jsoup, follows each entry's detail
 * link for the description, and can dump the result as JSON.
 *
 * Call {@link #close()} when done to release the underlying HTTP client.
 */
class BookScraper {
    /** Charset used to decode a response body when the server does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8"

    /** Maps the rating word found in the CSS class to its numeric value. */
    private static final Map<String, Integer> RATING_WORDS = [One: 1, Two: 2, Three: 3, Four: 4, Five: 5]

    /**
     * Accepts both "star-rating-Five" (single class token) and
     * "star-rating Five" (two class tokens); the latter is what an element
     * selected by ".star-rating" normally carries.
     */
    private static final java.util.regex.Pattern RATING_PATTERN = ~/star-rating[-\s]+(\w+)/

    // HTTP client and its configuration
    private def httpClient
    private def config
    private def objectMapper

    BookScraper() {
        // 5 second connect and socket (read) timeouts
        config = RequestConfig.custom()
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .build()

        httpClient = HttpClients.custom()
            .setDefaultRequestConfig(config)
            .build()

        objectMapper = new ObjectMapper()
        objectMapper.enable(SerializationFeature.INDENT_OUTPUT)
    }

    /**
     * Downloads a page and returns its body as text.
     *
     * @param url absolute URL to fetch
     * @return the response body, or {@code null} if the request failed, the
     *         server answered with a non-2xx status, or there was no body
     */
    String fetchPage(String url) {
        def response = null
        try {
            println "正在抓取: $url"
            def httpGet = new HttpGet(url)
            // Browser-like request headers
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
            httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

            response = httpClient.execute(httpGet)
            def entity = response.getEntity()

            // An error page is not usable content; report it instead of parsing it.
            def status = response.getStatusLine().getStatusCode()
            if (status < 200 || status >= 300) {
                EntityUtils.consumeQuietly(entity)
                println "抓取网页时出错: HTTP状态码 ${status}"
                return null
            }
            if (entity == null) {
                println "抓取网页时出错: 响应内容为空"
                return null
            }

            // Without an explicit fallback, HttpClient decodes undeclared
            // charsets as ISO-8859-1, which garbles UTF-8 pages.
            return EntityUtils.toString(entity, DEFAULT_CHARSET)
        } catch (Exception e) {
            println "抓取网页时出错: ${e.message}"
            return null
        } finally {
            // Always release the connection, even when reading the body failed.
            try {
                response?.close()
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
    }

    /**
     * Parses a catalogue listing page into books. Each entry's detail page
     * is fetched as part of parsing. A failure on one entry is logged and
     * that entry is skipped; the remaining entries are still processed.
     *
     * @param html    HTML of the listing page
     * @param baseUrl prefix used to turn relative detail links into absolute URLs
     * @return the parsed books; empty if nothing could be parsed
     */
    List<Book> parseBookList(String html, String baseUrl) {
        def books = []
        try {
            Document doc = Jsoup.parse(html)
            Elements bookElements = doc.select(".product_pod") // one element per book is assumed

            bookElements.each { Element element ->
                try {
                    Book book = parseBook(element, baseUrl)

                    // Enrich with data from the detail page
                    fetchBookDetails(book)

                    books << book
                    println "已解析: ${book.title}"
                } catch (Exception e) {
                    println "解析单本书时出错: ${e.message}"
                }
            }
        } catch (Exception e) {
            println "解析图书列表时出错: ${e.message}"
        }

        return books
    }

    /** Extracts the fields available on the listing entry itself. */
    private static Book parseBook(Element element, String baseUrl) {
        Book book = new Book()
        def link = element.select("h3 a")

        // Prefer the title attribute; fall back to the visible link text.
        book.title = link.attr("title") ?: link.text()
        book.author = element.select(".author").text() ?: "未知作者"
        book.price = parsePrice(element.select(".price_color").text())
        book.rating = parseRating(element.select(".star-rating").attr("class"))
        book.url = resolveUrl(baseUrl, link.attr("href"))
        return book
    }

    /** Strips currency symbols and returns the price, or 0.0 when no valid number remains. */
    private static BigDecimal parsePrice(String priceText) {
        def digits = (priceText ?: "").replaceAll("[^\\d.]", "")
        if (!digits) {
            return 0.0
        }
        try {
            return new BigDecimal(digits)
        } catch (NumberFormatException ignored) {
            // e.g. "N.A." reduces to "." - treat like a missing price rather than dropping the book
            return 0.0
        }
    }

    /** Converts a rating class attribute to 1-5, or 0 when no known rating word is present. */
    private static Integer parseRating(String classAttr) {
        def matcher = RATING_PATTERN.matcher(classAttr ?: "")
        return matcher.find() ? (RATING_WORDS[matcher.group(1)] ?: 0) : 0
    }

    /**
     * Joins a base URL and a link with exactly one slash between them.
     * Links that are already absolute http(s) URLs are returned unchanged.
     * Dot segments such as "../" are not normalized.
     */
    private static String resolveUrl(String baseUrl, String href) {
        def base = baseUrl ?: ""
        if (!href) {
            return base
        }
        def lower = href.toLowerCase()
        if (lower.startsWith("http://") || lower.startsWith("https://")) {
            return href
        }
        return base.replaceAll('/+$', '') + "/" + href.replaceAll('^/+', '')
    }

    /**
     * Fetches the book's detail page and fills in its description.
     * Failures are logged and leave the book unchanged.
     *
     * @param book book whose {@code url} points at the detail page
     */
    void fetchBookDetails(Book book) {
        try {
            def html = fetchPage(book.url)
            if (html) {
                Document doc = Jsoup.parse(html)

                // The description is the paragraph right after #product_description
                def descriptionElement = doc.select("#product_description + p")
                if (descriptionElement) {
                    book.description = descriptionElement.text()
                }
            }
        } catch (Exception e) {
            println "获取图书详情时出错: ${e.message}"
        }
    }

    /**
     * Writes the books to a JSON file. Errors are logged, not thrown.
     *
     * @param books    books to serialize
     * @param filename path of the output file
     */
    void saveToJson(List<Book> books, String filename) {
        try {
            objectMapper.writeValue(new File(filename), books)
            println "结果已保存到: ${filename}"
        } catch (Exception e) {
            println "保存结果时出错: ${e.message}"
        }
    }

    /**
     * Closes the HTTP client and releases its connections.
     */
    void close() {
        httpClient?.close()
    }
}

/**
 * Script entry point: fetches the first catalogue page, parses the books on
 * it, prints a summary of each, and writes everything to books.json.
 * The scraper is always closed, whether or not the run succeeds.
 */
def main() {
    def scraper = new BookScraper()

    try {
        // Target site (a fictional book shop used as an example)
        def baseUrl = "https://books.example.com"
        def pageHtml = scraper.fetchPage("${baseUrl}/catalogue/page-1.html")

        // Nothing to do without the listing page
        if (!pageHtml) {
            println "未能获取网页内容"
            return
        }

        def books = scraper.parseBookList(pageHtml, baseUrl)

        // Print a numbered summary of every book
        println "\n爬取结果:"
        int position = 0
        for (book in books) {
            position++
            println "${position}. ${book}"

            def summary = book.description
            if (summary) {
                // Show at most the first 100 characters of the description
                if (summary.length() > 100) {
                    summary = summary.substring(0, 100) + '...'
                }
                println "   描述: ${summary}"
            }
            println()
        }

        // Persist the result as JSON
        scraper.saveToJson(books, "books.json")

        println "共爬取 ${books.size()} 本书籍"
    } finally {
        // Release the HTTP client in every case
        scraper.close()
    }
}

// Run the script
main()

代码说明

这个Groovy爬虫示例具有以下特点：

1、完整的爬虫功能：

发送HTTP请求并处理响应
解析HTML内容提取所需数据
处理异常和错误情况
保存结果到JSON文件

2、使用Groovy的优势：

简洁的语法和强大的集合操作
直接使用Java生态库（Jsoup、HttpClient、Jackson）
灵活的闭包和DSL风格代码
无需编译，可直接运行

3、实用功能：

设置超时和请求头模拟浏览器
错误处理和日志输出
数据清洗和转换
结构化数据存储

运行方法

1、确保已安装Groovy

2、将代码保存为book_scraper.groovy

3、运行命令：groovy book_scraper.groovy（首次运行时，@Grab会自动下载Jsoup、HttpClient和Jackson等依赖，需要保证网络可以访问Maven仓库）

扩展建议

1、添加代理支持以避免IP被封

2、实现分页爬取功能

3、添加并发处理以提高爬取效率

4、集成数据库存储代替文件存储

5、添加定时任务支持定期爬取

这个示例展示了Groovy在爬虫开发中的简洁性和强大功能，特别适合需要快速开发和与Java生态系统集成的项目。

看出来了吧？Groovy搞爬虫真是又狠又灵活！代码写得比Java清爽十倍，底层还能调用所有Java生态的硬核工具库。无论是快速抓数据还是集成到现有Java系统里都特别顺手。不过记得爬数据要讲武德，控制频率加异常处理，毕竟咱不是暴力爬虫。用这招去折腾数据吧，绝对爽到飞起！

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

jsoup

java

原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。

如有侵权，请联系 cloudcommunity@tencent.com 删除。

登录后参与评论

0 条评论

热度