When crawling pages with Jsoup, a connection pool can improve efficiency and performance: established network connections are reused instead of being created and closed for every request, which reduces resource consumption and network latency.
To use a connection pool with Jsoup, you can bring in a third-party library such as Apache HttpClient. One way to implement this is shown below. First, add the HttpClient dependency to pom.xml:
<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.13</version>
    </dependency>
</dependencies>
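Jsoup itself also has to be on the classpath. A typical coordinate looks like the following — the version number here is only an assumption, so use whichever recent release your project targets:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>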
Next, a pool manager class wraps PoolingHttpClientConnectionManager and exposes a reusable CloseableHttpClient:

import org.apache.http.client.config.RequestConfig;
import org.apache.http.config.ConnectionConfig;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustSelfSignedStrategy;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import java.nio.charset.StandardCharsets;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.util.concurrent.TimeUnit;
public class JsoupConnectionPoolManager {
    private static final int DEFAULT_MAX_TOTAL_CONNECTIONS = 200;    // maximum connections in the pool
    private static final int DEFAULT_MAX_PER_ROUTE_CONNECTIONS = 20; // maximum connections per route (host)
    private static final int DEFAULT_CONNECT_TIMEOUT = 5000;         // connect timeout (ms)
    private static final int DEFAULT_READ_TIMEOUT = 10000;           // socket read timeout (ms)

    private PoolingHttpClientConnectionManager connectionManager;
    private CloseableHttpClient httpClient;
    public JsoupConnectionPoolManager() {
        // Socket factories for plain HTTP and for HTTPS. This HTTPS setup trusts
        // self-signed certificates and skips hostname verification, which may be
        // acceptable for crawling but never for sensitive traffic.
        ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
        LayeredConnectionSocketFactory sslsf;
        try {
            sslsf = new SSLConnectionSocketFactory(
                    SSLContextBuilder.create()
                            .loadTrustMaterial(new TrustSelfSignedStrategy())
                            .build(),
                    NoopHostnameVerifier.INSTANCE);
        } catch (NoSuchAlgorithmException | KeyStoreException | KeyManagementException e) {
            // Fail fast: without an SSL socket factory the pool cannot serve HTTPS
            throw new IllegalStateException("Failed to initialize SSL context", e);
        }
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", plainsf)
                .register("https", sslsf)
                .build();

        connectionManager = new PoolingHttpClientConnectionManager(registry);
        connectionManager.setMaxTotal(DEFAULT_MAX_TOTAL_CONNECTIONS);
        connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE_CONNECTIONS);
        // Re-validate a pooled connection before use if it sat idle for over 1s
        connectionManager.setValidateAfterInactivity(1000);

        ConnectionConfig connectionConfig = ConnectionConfig.custom()
                .setCharset(StandardCharsets.UTF_8)
                .build();
        connectionManager.setDefaultConnectionConfig(connectionConfig);

        RequestConfig defaultRequestConfig = RequestConfig.custom()
                .setConnectTimeout(DEFAULT_CONNECT_TIMEOUT)
                .setSocketTimeout(DEFAULT_READ_TIMEOUT)
                .setConnectionRequestTimeout(DEFAULT_CONNECT_TIMEOUT)
                .build();

        httpClient = HttpClients.custom()
                .setConnectionManager(connectionManager)
                .setConnectionManagerShared(true)              // pool outlives any single client
                .evictIdleConnections(60, TimeUnit.SECONDS)    // evict connections idle for 60s
                .evictExpiredConnections()
                .setConnectionTimeToLive(60, TimeUnit.SECONDS) // hard cap on connection lifetime
                .setDefaultRequestConfig(defaultRequestConfig)
                .setConnectionReuseStrategy(DefaultConnectionReuseStrategy.INSTANCE)
                .setKeepAliveStrategy(DefaultConnectionKeepAliveStrategy.INSTANCE)
                .setRetryHandler(new DefaultHttpRequestRetryHandler(3, true)) // retry idempotent requests up to 3 times
                .build();
    }
    public CloseableHttpClient getHttpClient() {
        return httpClient;
    }
}
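Because the client is built with setConnectionManagerShared(true), closing the CloseableHttpClient alone will not release the pool. A minimal shutdown sketch that could be added to JsoupConnectionPoolManager looks like this — the method name shutdown is an illustrative choice, not part of the original code:

    // Hypothetical cleanup method: call once when crawling is finished
    public void shutdown() {
        try {
            httpClient.close();        // stops the idle/expired eviction threads
        } catch (java.io.IOException e) {
            e.printStackTrace();
        }
        connectionManager.close();     // closes every pooled connection
    }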
The pooled client is then used to fetch pages, and Jsoup parses the HTML it returns. Note that calling Jsoup.connect() would bypass the pool entirely, because Jsoup opens its own HttpURLConnection internally:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class Main {
    public static void main(String[] args) {
        String url = "https://example.com";
        JsoupConnectionPoolManager connectionPoolManager = new JsoupConnectionPoolManager();
        CloseableHttpClient httpClient = connectionPoolManager.getHttpClient();

        HttpGet request = new HttpGet(url);
        request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36");
        try (CloseableHttpResponse response = httpClient.execute(request)) {
            // Read the body through a pooled connection, then hand it to Jsoup
            String html = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
            Document doc = Jsoup.parse(html, url);  // base URL resolves relative links
            System.out.println(doc.title());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
In the example above, JsoupConnectionPoolManager sets up the connection pool, and getHttpClient() returns a reusable CloseableHttpClient backed by it. Every HttpGet executed through that client borrows a connection from the pool, and request parameters such as the User-Agent header and timeouts are configured on the request or on the client's RequestConfig. Jsoup then acts purely as a parser: Jsoup.parse(html, url) turns the downloaded HTML into a Document from which the title, or any other element, can be extracted.
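The pool really pays off once pages are fetched concurrently, since worker threads can share the same client and draw connections from the pool in parallel. Below is a minimal sketch of that pattern; the class name ConcurrentCrawler, the thread count, and the URL list are all illustrative assumptions (List.of requires Java 9+):

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ConcurrentCrawler {
    public static void main(String[] args) throws InterruptedException {
        List<String> urls = List.of("https://example.com", "https://example.org"); // sample targets
        CloseableHttpClient httpClient = new JsoupConnectionPoolManager().getHttpClient(); // one shared client
        ExecutorService executor = Executors.newFixedThreadPool(4); // workers share the pool

        for (String url : urls) {
            executor.submit(() -> {
                try (CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
                    String html = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
                    System.out.println(url + " -> " + Jsoup.parse(html, url).title());
                } catch (Exception e) {
                    e.printStackTrace();
                }
            });
        }
        executor.shutdown();
        executor.awaitTermination(1, TimeUnit.MINUTES); // wait for all fetches to finish
    }
}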
A connection pool improves crawling efficiency and performance, and reusing connections also avoids hammering the target site with excessive connection churn. When using one, configure the pool's maximum total connections and maximum connections per route sensibly, set appropriate timeouts, and adjust these values to the actual workload.
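For instance, when most traffic goes to a single site, the per-route cap can be raised for just that host inside the constructor of JsoupConnectionPoolManager. The host name and limit below are illustrative values, and the snippet additionally needs org.apache.http.HttpHost and org.apache.http.conn.routing.HttpRoute imported:

        // Allow up to 50 concurrent connections to one heavily crawled host,
        // while all other hosts keep DEFAULT_MAX_PER_ROUTE_CONNECTIONS
        HttpRoute heavyRoute = new HttpRoute(new HttpHost("example.com", 443, "https"));
        connectionManager.setMaxPerRoute(heavyRoute, 50);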