The earlier notes already cover much of this, so I will not repeat it here; these three articles describe the topic in great detail and serve as references (a short refresher sketch follows the list):
Basic selectors:
Attribute selectors:
Combinator selectors:
Pseudo selectors:
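As a quick refresher, here is a minimal Jsoup sketch touching all four selector categories above; the HTML fragment and selector strings are illustrative assumptions, not taken from the referenced articles:
package com.qian.test;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class CssSelectorReview {
    public static void main(String[] args) {
        // Illustrative HTML fragment (an assumption, not from the original notes)
        String html = "<div id='course'><ul>"
                + "<li><a href='/html'>HTML</a></li>"
                + "<li><a href='/css'>CSS</a></li></ul></div>";
        Document doc = Jsoup.parse(html);
        // Basic selector: match by tag name
        System.out.println(doc.select("li").size());
        // Attribute selector: elements carrying an href attribute
        System.out.println(doc.select("a[href]").size());
        // Combinator selector: li elements that are direct children of ul under #course
        System.out.println(doc.select("#course > ul > li").size());
        // Pseudo selector: the li that is the first child of its parent
        System.out.println(doc.select("li:first-child").text());
    }
}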
The earlier notes already cover much of this, so I will not repeat it here; this one article describes the topic in great detail and serves as a reference:
The earlier notes already cover much of this, so I will not repeat it here; the relevant earlier notes are as follows:
<!-- https://mvnrepository.com/artifact/cn.wanghaomiao/JsoupXpath -->
<dependency>
    <groupId>cn.wanghaomiao</groupId>
    <artifactId>JsoupXpath</artifactId>
    <version>2.3.2</version>
</dependency>
package com.qian.test;

import java.util.List;

import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;

public class JsoupXpathTest1 {
    public static void main(String[] args) {
        // Create a JXDocument directly from a URL
        JXDocument jxd = JXDocument.createByUrl("http://www.w3school.com.cn/b.asp");
        // XPath expression
        String str = "//*[@id='course']/ul/li/a";
        // Fetch the matching node set
        List<JXNode> list = jxd.selN(str);
        // Iterate over the nodes
        for (int i = 0; i < list.size(); i++) {
            JXNode node = list.get(i);
            System.out.println("Title: " + node.asElement().text()
                    + "\tURL: " + node.asElement().attr("href"));
        }
    }
}
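If only the first match is needed, JXDocument also exposes single-node variants; a minimal sketch, assuming the selNOne(String) method of JsoupXpath 2.x (the class name JsoupXpathTest2 is mine):
package com.qian.test;

import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;

public class JsoupXpathTest2 {
    public static void main(String[] args) {
        JXDocument jxd = JXDocument.createByUrl("http://www.w3school.com.cn/b.asp");
        // selNOne returns only the first matching node (or null if nothing matches)
        JXNode first = jxd.selNOne("//*[@id='course']/ul/li/a");
        if (first != null) {
            System.out.println("First title: " + first.asElement().text());
        }
    }
}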
HtmlCleaner is another HTML document parser written in Java; it supports XPath syntax for extracting nodes and elements from HTML.
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlcleaner/htmlcleaner -->
<dependency>
    <groupId>net.sourceforge.htmlcleaner</groupId>
    <artifactId>htmlcleaner</artifactId>
    <version>2.24</version>
</dependency>
Parsing an HTML document with HtmlCleaner involves two classes: org.htmlcleaner.HtmlCleaner and org.htmlcleaner.TagNode. The HtmlCleaner class provides the parser itself, and its overloaded clean(...) methods turn a given input (an HTML String, a URL, and so on) into a node tree rooted at a TagNode.
The TagNode class in turn offers a series of methods for working with a node, such as getText(), getAttributeByName(String), findElementByName(String, boolean), and evaluateXPath(String).
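A minimal sketch of the two classes working together; it assumes the clean(URL) overload of HtmlCleaner 2.x (which may be deprecated in newer releases in favor of fetching the page yourself, as the full example below does with Jsoup), and the URL and element names are illustrative:
package com.crawler.htmlcleaner;

import java.io.IOException;
import java.net.URL;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

public class HtmlcleanerApiSketch {
    public static void main(String[] args) throws IOException {
        HtmlCleaner cleaner = new HtmlCleaner();
        // clean(...) is overloaded: a String, URL, File, or InputStream all work
        TagNode root = cleaner.clean(new URL("http://www.w3school.com.cn/b.asp"));
        // A few common TagNode accessors
        TagNode firstH1 = root.findElementByName("h1", true); // recursive search
        if (firstH1 != null) {
            System.out.println(firstH1.getText());
            System.out.println(firstH1.getAttributeByName("id"));
        }
    }
}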
package com.crawler.htmlcleaner;

import java.io.IOException;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class HtmlcleanerTest1 {
    public static void main(String[] args) throws IOException, XPatherException {
        // Fetch the HTML page with Jsoup
        Document doc = Jsoup.connect("http://www.********.com.cn/b.asp")
                .timeout(5000).get();
        // Convert it to a String
        String html = doc.html();
        // Instantiate HtmlCleaner
        HtmlCleaner cleaner = new HtmlCleaner();
        // Convert the String into a TagNode tree
        TagNode node = cleaner.clean(html);
        // Locate the title via XPath; here //h1 and /h1 yield the same result
        Object[] ns = node.evaluateXPath("//div[@id='********']//h1");
        System.out.println("The page title is:\t" + ((TagNode) ns[0]).getText());
        Object[] ns1 = node.evaluateXPath("//*[@id='********']/h1");
        System.out.println("The page title is:\t" + ((TagNode) ns1[0]).getText());
        // //a matches regardless of position; /a would return nothing here
        Object[] ns2 = node.evaluateXPath("//*[@id='course']/ul//a");
        for (Object on : ns2) { // iterate to get each course name and URL
            TagNode n = (TagNode) on;
            System.out.println("Course name:\t" + n.getText()
                    + "\tURL:\t" + n.getAttributeByName("href"));
        }
        // Get each course name together with its introduction
        Object[] ns3 = node.evaluateXPath("//*[@id='maincontent']//div");
        for (int i = 1; i < ns3.length; i++) {
            TagNode n = (TagNode) ns3[i];
            // Course name
            String courseName = n.findElementByName("h2", true).getText().toString();
            // Concatenate all p nodes to build the course introduction
            Object[] objarrtr = n.evaluateXPath("//p");
            String summary = "";
            for (Object on : objarrtr) {
                summary += ((TagNode) on).getText().toString();
            }
            System.out.println(courseName + "\t" + summary);
        }
    }
}
HTMLParser is another very efficient HTML parser; it supports CSS selectors for extracting nodes from HTML. HTMLParser is no longer updated, but that does not stop it from being usable.
<!-- https://mvnrepository.com/artifact/org.htmlparser/htmlparser -->
<dependency>
    <groupId>org.htmlparser</groupId>
    <artifactId>htmlparser</artifactId>
    <version>2.1</version>
</dependency>
package com.crawler.htmlparser;

import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class HTMLParserTest1 {
    public static void main(String[] args) throws IOException, ParserException {
        // Fetch the HTML page with Jsoup
        Document doc = Jsoup.connect("http://www.********.com.cn/b.asp")
                .timeout(5000).get();
        // Convert it to a String
        String html = doc.html();
        // Build the Parser from a Lexer
        Lexer lexer = new Lexer(html);
        Parser parser = new Parser(lexer);
        // Filter for link tags in the page
        NodeFilter filter = new NodeClassFilter(LinkTag.class);
        // Collect the matching nodes
        NodeList list = parser.extractAllNodesThatMatch(filter);
        // Iterate over each node to get its link and title
        for (int i = 0; i < list.size(); i++) {
            Node node = list.elementAt(i);
            System.out.println("Link: " + ((LinkTag) node).getLink()
                    + "\tTitle: " + node.toPlainTextString());
        }
    }
}
package com.crawler.htmlparser;

import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HTMLParserTest2 {
    public static void main(String[] args) throws IOException, ParserException {
        // Instantiate Parser with the page URL as its argument
        Parser parser = new Parser("http://www.********.com.cn/b.asp");
        // Set the page encoding (GBK)
        parser.setEncoding("gbk");
        // Filter for ul tags in the page
        NodeFilter filtertag = new TagNameFilter("ul");
        // Match nodes whose parent is a ul
        NodeFilter filterParent = new HasParentFilter(filtertag);
        // Match li tags that carry an id attribute
        NodeFilter filtername = new TagNameFilter("li");
        NodeFilter filterId = new HasAttributeFilter("id");
        // AND-combine the filters
        NodeFilter filter = new AndFilter(filterParent, filtername);
        NodeFilter filterfinal = new AndFilter(filter, filterId);
        // Collect the matching nodes
        NodeList list = parser.extractAllNodesThatMatch(filterfinal);
        // Iterate over the matches
        for (int i = 0; i < list.size(); i++) {
            // Get the first child of each li
            Node node = list.elementAt(i).getFirstChild();
            System.out.println("Link: " + ((LinkTag) node).getLink()
                    + "\tTitle: " + node.toPlainTextString());
        }
    }
}
package com.crawler.htmlparser;

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.CssSelectorNodeFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HTMLParserTest3 {
    public static void main(String[] args) throws IOException, ParserException {
        // Request the page through URLConnection
        URL url = new URL("http://www.********.com.cn/b.asp");
        URLConnection conn = url.openConnection();
        Parser parser = new Parser(conn);
        // Filter with a CSS selector
        CssSelectorNodeFilter divFilter = new CssSelectorNodeFilter("#course > ul > li");
        // Collect the matching nodes
        NodeList list = parser.extractAllNodesThatMatch(divFilter);
        // Iterate over the matches
        for (int i = 0; i < list.size(); i++) {
            // Get the first child of each li
            Node node = list.elementAt(i).getFirstChild();
            System.out.println("Link: " + ((LinkTag) node).getLink()
                    + "\tTitle: " + node.toPlainTextString());
        }
    }
}
package com.crawler.htmlparser;

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HTMLParserTest4 {
    public static void main(String[] args) throws IOException, ParserException {
        // Request the page through URLConnection
        URL url = new URL("http://www.********.com.cn/b.asp");
        URLConnection conn = url.openConnection();
        Parser parser = new Parser(conn);
        // Child criterion: the child must be an li tag
        NodeFilter filtername = new TagNameFilter("li");
        // Match nodes that have an li child (i.e. the enclosing lists)
        NodeFilter filter = new HasChildFilter(filtername);
        NodeList nodes = parser.extractAllNodesThatMatch(filter);
        // Iterate over the matches
        for (int i = 0; i < nodes.size(); i++) {
            // Get the first child (the first li) of each matched node
            Node node = nodes.elementAt(i).getFirstChild();
            System.out.println(node.toPlainTextString());
        }
    }
}
Jsoup can parse XML as well as HTML, and since XML is handled almost identically to HTML, only a code demonstration is given here for reference.
package com.crawler.xml;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupXML {
    public static void main(String[] args) throws IOException {
        // Fetch the XML document at the URL
        String url = "http://db.auto.****.com/cxdata/xml/sales/model/model1001sales.xml";
        Document doc = Jsoup.connect(url).timeout(5000).get();
        // Extract the sales elements with a Jsoup selector
        Elements sales_ele = doc.select("sales");
        for (Element elem : sales_ele) {
            int salesnum = Integer.valueOf(elem.attr("salesnum"));
            String date = elem.attr("date");
            System.out.println("Month: " + date + "\tSales: " + salesnum);
        }
    }
}
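One caveat: Jsoup.connect(url).get() runs the lenient HTML parser, which can normalize XML markup. For strict XML handling, Jsoup ships a dedicated XML parser; a minimal sketch, with an illustrative XML fragment mirroring the sales data above:
package com.crawler.xml;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;

public class JsoupXmlParserSketch {
    public static void main(String[] args) {
        // Illustrative XML fragment (an assumption, not real sales data)
        String xml = "<root><sales date='2023-01' salesnum='120'/>"
                + "<sales date='2023-02' salesnum='98'/></root>";
        // Parser.xmlParser() preserves the document as-is,
        // without injecting html/head/body elements
        Document doc = Jsoup.parse(xml, "", Parser.xmlParser());
        for (Element elem : doc.select("sales")) {
            System.out.println(elem.attr("date") + "\t" + elem.attr("salesnum"));
        }
    }
}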