1.在maven中导入httpClient依赖
<!-- Apache HttpClient (org.apache.httpcomponents:httpclient), version 4.5.4; place inside the <dependencies> element of pom.xml -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.4</version>
</dependency>
2. 发送请求并接收数据
2.1 发送get请求
package com.vking.httpClient;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal example of sending an HTTP GET request with Apache HttpClient.
 *
 * <p>Requests {@code http://www.jd.com}, prints the response status code and,
 * when the status is 200, prints the response body decoded as UTF-8.
 *
 * @author 待你如初见
 * @create 2018-09-25 9:59
 **/
public class HTTPClientGet {

    /**
     * Runs the GET example.
     *
     * @param args unused
     * @throws IOException if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 1. Choose the request method and target URL.
        HttpGet get = new HttpGet("http://www.jd.com");

        // 2. Optional: set request headers (here a browser-like user agent).
        get.setHeader("user-agent", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36");

        // 3. Create the client and send the request. try-with-resources closes the
        //    response first and then the client, so the underlying connection is
        //    released even when reading the body throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(get)) {
            // 4. Read the status code from the status line.
            int statusCode = response.getStatusLine().getStatusCode();
            System.out.println(statusCode);

            // 5. Read the body only for 200 responses that actually carry an entity;
            //    EntityUtils.toString rejects a null entity.
            if (statusCode == 200 && response.getEntity() != null) {
                String html = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(html);
            }
        }
    }
}
2.2 发送post请求
package com.vking.httpClient;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
/**
 * Minimal example of sending an HTTP POST request with form parameters using
 * Apache HttpClient.
 *
 * <p>Posts {@code username} and {@code age} to {@code http://www.jd.com}, then prints
 * the status code, the {@code Content-Type} response header and the response body.
 *
 * @author 待你如初见
 * @create 2018-09-25 10:12
 **/
public class HttpClientPost {

    /**
     * Runs the POST example.
     *
     * @param args unused
     * @throws Exception if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws Exception {
        // 1. Choose the request method and target URL.
        HttpPost httpPost = new HttpPost("http://www.jd.com");

        // 2. Build the form parameters and attach them as the request body.
        //    The charset is given explicitly so non-ASCII values are encoded as UTF-8
        //    instead of depending on the library default.
        List<BasicNameValuePair> list = new ArrayList<BasicNameValuePair>();
        list.add(new BasicNameValuePair("username", "xiaochuan"));
        list.add(new BasicNameValuePair("age", "48"));
        HttpEntity entity = new UrlEncodedFormEntity(list, "UTF-8");
        httpPost.setEntity(entity);

        // 3. Create the client and execute the request. try-with-resources closes the
        //    response and then the client, releasing the connection on every path.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // 4.1 Status code.
            int statusCode = response.getStatusLine().getStatusCode();

            // 4.2 Content-Type response header. getFirstHeader returns null when the
            //     header is absent, which avoids indexing into an empty array.
            Header contentType = response.getFirstHeader("Content-Type");
            String value = contentType != null ? contentType.getValue() : "<none>";
            System.out.println(statusCode + " " + value);

            // 4.3 Response body; a response may legitimately have no entity.
            HttpEntity body = response.getEntity();
            if (body != null) {
                String html = EntityUtils.toString(body, "UTF-8");
                System.out.println(html);
            }
        }
    }
}
jsoup是一款专门用于在Java端解析HTML的工具包。HTML文档解析之后其实就是一个DOM对象, 所以使用jsoup的第一步是先获取到文档的DOM对象(Document)。
1.导入依赖
<!-- jsoup HTML parser (org.jsoup:jsoup), version 1.10.3; place inside the <dependencies> element of pom.xml -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
package com.vking.jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.File;
import java.io.IOException;
/**
 * Shows the different ways of obtaining a jsoup {@link Document}.
 *
 * <p>Two of them are executed: parsing a complete HTML string and parsing an HTML
 * fragment. The HTTP-based and file-based variants are only mentioned in comments.
 *
 * @author 待你如初见
 * @create 2018-09-25 10:55
 **/
public class JsoupDom {

    /**
     * Parses the two sample inputs and prints the page title followed by the
     * fragment text.
     *
     * @param args unused
     * @throws IOException declared for the disabled HTTP/file variants
     */
    public static void main(String[] args) throws IOException {
        // Way 1 (the most common): parse a full HTML document held in a string.
        String fullPage = String.join("\n",
                "<!DOCTYPE html>",
                "<html lang=\"en\">",
                "<head>",
                " <meta charset=\"UTF-8\">",
                " <title>获取html文档的第一种方式</title>",
                "</head>",
                "<body>",
                "",
                "</body>",
                "</html>");
        Document parsedPage = Jsoup.parse(fullPage);
        System.out.println(parsedPage.title());

        // Way 2 (disabled): fetch the document over HTTP,
        //   e.g. Jsoup.connect("http://www.jd.com").get()
        // Way 3 (disabled): load an external HTML file,
        //   e.g. Jsoup.parse(new File(""), "utf-8")

        // Way 4: build a document from an HTML fragment
        // (Jsoup.parse(fragment) was the commented-out alternative).
        String fragment = "<a>第四种获取dom对象的方式</a>";
        Document fragmentDoc = Jsoup.parseBodyFragment(fragment);
        System.out.println(fragmentDoc.text());
    }
}
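在jsoup中一共提供了两套API: 一套是类似原生JS操作DOM的遍历方法(如 getElementById、getElementsByTag、getElementsByClass), 另一套是基于CSS选择器的 select 方法。
jsoup常用的方法: select(cssQuery) 按选择器查找元素、text() 获取文本内容、attr(name) 获取属性值(下面的爬虫代码中都会用到)。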
// This version uses multiple threads plus a blocking queue; the details of threads
// and queues are to be covered in a later article.
package com.vking.spider;

import com.google.gson.Gson;
import com.vking.ProductDao;
import com.vking.pojo.Product;
import com.vking.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Single-process JD crawler built on a producer/consumer queue.
 *
 * <p>{@link #page()} walks the search result pages and puts every SKU id into a
 * bounded queue; the workers started by {@link #toothread()} take ids from the queue,
 * fetch the product detail page and price, and store the result through
 * {@code ProductDao}.
 *
 * @author 待你如初见
 * @create 2018-09-25 10:12
 **/
public class JDSpider {

    /** Number of worker threads that turn SKU ids into stored products. */
    private static final int WORKER_COUNT = 50;

    /** Number of search result pages to crawl. */
    private static final int PAGE_COUNT = 100;

    /** Gson instance reused for every price response. */
    private static final Gson GSON = new Gson();

    // Database access object (per the original author, backed by Spring JdbcTemplate).
    private static ProductDao productDao = new ProductDao();

    // Queue of SKU ids with a capacity of one thousand.
    private static BlockingQueue<String> queue = new ArrayBlockingQueue<String>(1000);

    // Fixed pool: the workers plus one thread for the queue monitor.
    private static ExecutorService threadPool = Executors.newFixedThreadPool(WORKER_COUNT + 1);

    /**
     * Starts the queue monitor, the workers and finally the page producer.
     *
     * @param args unused
     * @throws Exception if crawling the search pages fails
     */
    public static void main(String[] args) throws Exception {
        // Monitor: prints the number of queued ids once per second.
        threadPool.execute(new Runnable() {
            @Override
            public void run() {
                while (true) {
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        // Keep the interrupt flag and stop instead of spinning on.
                        Thread.currentThread().interrupt();
                        return;
                    }
                    System.out.println("当前剩余" + queue.size());
                }
            }
        });

        // Start the consumers first: page() runs on this thread and blocks in
        // queue.put once the queue is full, so consumers must already be draining it.
        toothread();

        // Start filling the queue.
        page();
    }

    /**
     * Fetches and parses one product.
     *
     * @param skuId the SKU id used in the item URL
     * @return a product populated with id, URL, title, brand, name and price
     * @throws IOException if a request fails
     */
    public static Product skuContent(String skuId) throws IOException {
        Product product = new Product();
        product.setPid(skuId);

        // Product detail page.
        String url = "https://item.jd.com/" + skuId + ".html";
        product.setUrl(url);
        String html = HttpClientUtils.doGet(url);
        Document document = Jsoup.parse(html);

        // Title.
        Elements skuName = document.select(".sku-name");
        product.setTitle(skuName.text());

        // Brand.
        Elements brand = document.select("#parameter-brand>li");
        product.setBrand(brand.attr("title"));

        // Product name.
        Elements pname = document.select("[class=\"parameter2 p-parameter-list\"] li:first-child");
        product.setPname(pname.attr("title"));

        // Price: it is not part of the detail page HTML, so it is requested
        // separately (the list page price would be the alternative source).
        product.setPrice(fetchPrice(skuId));
        return product;
    }

    /** Requests the price JSON for one SKU and returns its "p" value, or null if absent. */
    private static String fetchPrice(String skuId) throws IOException {
        String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + skuId;
        String priceJson = HttpClientUtils.doGet(priceUrl);

        // The response is expected to be a JSON array of objects; inspect it
        // defensively instead of casting to List<Map<String, String>> unchecked.
        List<?> entries = GSON.fromJson(priceJson, List.class);
        if (entries == null || entries.isEmpty() || !(entries.get(0) instanceof Map)) {
            return null;
        }
        Object price = ((Map<?, ?>) entries.get(0)).get("p");
        return price == null ? null : price.toString();
    }

    /**
     * Stores a product in the database.
     *
     * @param product the product to store
     */
    public static void addProduct(Product product) {
        // NOTE(review): "addProducr" is the DAO method name used by the original
        // code; confirm against ProductDao before renaming.
        productDao.addProducr(product);
    }

    /**
     * Crawls the search result pages and queues every SKU id found.
     *
     * @throws IOException if a search page request fails
     * @throws InterruptedException if interrupted while sleeping or queueing
     */
    public static void page() throws IOException, InterruptedException {
        // Search pages use odd page numbers (1, 3, 5, ...), so start at i = 1;
        // starting at 0 would request page -1.
        for (int i = 1; i <= PAGE_COUNT; i++) {
            // The keyword is percent-encoded UTF-8 for "手机".
            String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=3ae312e430f94a798fd95b1a94b9bd4e&page=" + (2 * i - 1);

            String html = HttpClientUtils.doGet(url);
            // Short pause between list page requests.
            Thread.sleep(200);

            Document document = Jsoup.parse(html);
            Elements lis = document.select("#J_goodsList>ul>li");
            for (Element li : lis) {
                String skuId = li.attr("data-sku");
                // attr() yields an empty string when the attribute is missing;
                // such entries cannot be turned into an item URL.
                if (skuId.isEmpty()) {
                    continue;
                }
                queue.put(skuId);
            }
        }
    }

    /**
     * Starts the worker threads that take SKU ids from the queue, fetch the
     * product and store it.
     */
    public static void toothread() {
        for (int i = 0; i < WORKER_COUNT; i++) {
            threadPool.execute(new Runnable() {
                @Override
                public void run() {
                    while (true) {
                        String skuId;
                        try {
                            skuId = queue.take();
                        } catch (InterruptedException e) {
                            // Keep the interrupt flag and let the worker end.
                            Thread.currentThread().interrupt();
                            return;
                        }
                        try {
                            Product product = skuContent(skuId);
                            System.out.println(product);
                            addProduct(product);
                        } catch (IOException | RuntimeException e) {
                            // One bad product must not end the worker; report and go on.
                            System.err.println("Failed to process sku " + skuId);
                            e.printStackTrace();
                        }
                    }
                }
            });
        }
    }
}
<build>
    <plugins>
        <!-- Compiler plugin: build with Java 1.8 and UTF-8 sources -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>utf-8</encoding>
            </configuration>
        </plugin>
        <!-- Packaging plugin: builds a single jar that bundles all dependencies -->
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <archive>
                    <manifest>
                        <!-- Entry point written to the jar manifest. It must name a class
                             that exists in this project; switch it to the slave's main
                             class when packaging the slave program. -->
                        <mainClass>com.vking.jdSpider.JdMaster</mainClass>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
        </plugin>
    </plugins>
</build>
package com.vking.jdSpider;
import com.vking.jedis.utils.JedisUtils;
import com.vking.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import redis.clients.jedis.Jedis;
/**
 * Master program: walks the search result pages and pushes the product id of
 * every list item into a Redis list for the slave programs to consume.
 *
 * @author 待你如初见
 * @create 2018-09-25 11:45
 **/
public class JdMaster {

    /** Redis list key that carries product ids from the master to the slaves. */
    private static final String PID_QUEUE_KEY = "bigData:jdSpider:pid";

    private static Jedis jedis = JedisUtils.getJedis();

    /**
     * Crawls all pages and releases the Redis connection afterwards.
     *
     * @param args unused
     * @throws Exception if a page request or a Redis command fails
     */
    public static void main(String[] args) throws Exception {
        try {
            page();
        } finally {
            // The connection is held for the whole run; release it on every exit path.
            jedis.close();
        }
    }

    /**
     * Requests search pages 1, 3, ..., 199 and hands each page to the id parser.
     *
     * @throws Exception if a page request or a Redis command fails
     */
    public static void page() throws Exception {
        for (int i = 1; i <= 100; i++) {
            String pageUrl = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&page=" + (2 * i - 1);
            // 1. Send the request and read the page.
            String html = HttpClientUtils.doGet(pageUrl);
            // 2. Extract the ids and queue them.
            parsePid(html);
        }
    }

    /** Extracts the product id of every list item in the page and pushes it to Redis. */
    private static void parsePid(String html) throws Exception {
        // 1. Build the DOM.
        Document document = Jsoup.parse(html);
        // 2. Select the product list items.
        Elements liEl = document.select("#J_goodsList>ul>li");
        // 3. Read the id from each item.
        for (Element li : liEl) {
            // NOTE(review): JdSlave builds the item URL from this id; confirm that
            // data-pid (rather than data-sku) is the intended attribute.
            String pid = li.attr("data-pid");
            // attr() yields an empty string when the attribute is missing; an empty
            // id would make the slave request a non-existent item page.
            if (pid.isEmpty()) {
                continue;
            }
            // 4. Queue the id in Redis.
            jedis.lpush(PID_QUEUE_KEY, pid);
        }
    }
}
import com.google.gson.Gson;
// 存入数据库类
import com.vking.dao.ProductDao;
// jedis工具类
import com.vking.jedis.utils.JedisUtils;
// pojo product类
import com.vking.pojo.Product;
// httpClient工具类
import com.vking.utils.HttpClientUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import redis.clients.jedis.Jedis;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
- slave程序主要 用来获取pid, 解析商品数据
- @author 待你如初见
- @create 2018-09-25 11:51
public class JdSlave {
private static ExecutorService threadPool = Executors.newFixedThreadPool(31);
private static ProductDao productDao = new ProductDao();
public static void main(String[] args) {
tooThread();
}
public static void tooThread(){
for(int i = 0 ; i<30 ; i++){
threadPool.execute(new Runnable() {
public void run() {
// 1. 从队列中获取数据
while(true) {
try {
// String pid = queue.take();
// 从redis中获取pid
Jedis jedis = JedisUtils.getJedis();
List<String> list = jedis.brpop(0, "bigData:jdSpider:pid");
jedis.close();
//2. 根据pid获取数据
Product product = parseProduct(list.get(1));
//3. 保存数据
productDao.addProduct(product);
} catch (Exception e) {
e.printStackTrace();
}
}
}
});
}
}
private static Product parseProduct(String pid) throws IOException {
Product product = new Product();
// 1. 拼接url
String pUrl = "https://item.jd.com/" + pid + ".html";
// 2. 发起请求, 获取商品详情页的数据
String html = HttpClientUtils.doGet(pUrl);
// 3. 解析商品详情页
// 3.1 获取dom对象
Document document = Jsoup.parse(html);
// 3.2 商品的标题
Elements titleEl = document.select(".sku-name");
product.setTitle(titleEl.text());
// 3.3 商品的品牌
Elements liEl = document.select("#parameter-brand>li");
product.setBrand(liEl.attr("title"));
// 3.4 商品的名称
Elements liPnameEl = document.select("[class=parameter2 p-parameter-list] li:first-child");
product.setPname(liPnameEl.attr("title"));
// 3.5 封装pid 和 url
product.setPid(pid);
product.setUrl(pUrl);
// 3.6 获取价格
String priceUrl = "https://p.3.cn/prices/mgets?skuIds=J_" + pid;
// 发起请求获取数据
String pirceJSON = HttpClientUtils.doGet(priceUrl);
// 如何判断一个json字符串是数组还是对象: 最需要查看最外围的符号即可, 如果是[] 那么就是数组, 如果是{}那就是对象
// gson和fastJson功能一样, gson是谷歌公司提供的一个json的转换工具
Gson gson = new Gson();
List<Map<String, String>> list = gson.fromJson(pirceJSON, List.class);
Map<String, String> map = list.get(0);
String price = map.get("p");
product.setPrice(price);
return product;
}
}
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有