前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >JSON综合性复杂案例

JSON综合性复杂案例

作者头像
编程那点事
发布2023-02-25 15:55:32
5150
发布2023-02-25 15:55:32
举报
文章被收录于专栏:java编程那点事

查询成绩在80分及以上的学生的基本信息与成绩信息。students.json 的内容如下(每行一个JSON对象): {"name":"Leo", "score":85} {"name":"Marry", "score":99} {"name":"Jack", "score":74}

代码语言:java
复制
/**
  * JSON数据源
  * @author Administrator
  *
  */
public class JSONDataSource {

​public static void main(String[] args) {
​​SparkConf conf = new SparkConf()​​​.setAppName("JSONDataSource");  
​​JavaSparkContext sc = new JavaSparkContext(conf);
​​SQLContext sqlContext = new SQLContext(sc);
​​// 针对json文件,创建DataFrame(针对json文件创建DataFrame)
​​DataFrame studentScoresDF = sqlContext.read().json​​​​"hdfs://spark1:9000/spark-study/students.json");  
// 针对学生成绩信息的DataFrame,注册临时表,查询分数大于80分的学生的姓名
​​// (注册临时表,针对临时表执行sql语句)
​​studentScoresDF.registerTempTable("student_scores");
​​DataFrame goodStudentScoresDF = sqlContext.sql(​​​​"select name,score from student_scores where score>=80");
// (将DataFrame转换为rdd,执行transformation操作)
​​List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(

new Function<Row, String>() {

​​​​​private static final long serialVersionUID = 1L;

​​​​​@Override
​​​​​public String call(Row row) throws Exception {
​​​​​​return row.getString(0);
​​​​​}
​​​​}).collect();

​​// 然后针对JavaRDD<String>,创建DataFrame
​​// (针对包含json串的JavaRDD,创建DataFrame)
​​List<String> studentInfoJSONs = new ArrayList<String>();
​​studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");  
​​studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");  
​​studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
​​JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
​​DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);
​​// 针对学生基本信息DataFrame,注册临时表,然后查询分数大于80分的学生的基本信息
​​studentInfosDF.registerTempTable("student_infos");  
​​String sql = "select name,age from student_infos where name in (";        
for(int i = 0; i < goodStudentNames.size(); i++) {
​​​sql += "'" + goodStudentNames.get(i) + "'";
​​​if(i < goodStudentNames.size() - 1) {
​​​​sql += ",";
​​​}
​​}
​​sql += ")";

​​DataFrame goodStudentInfosDF = sqlContext.sql(sql);
​​// 然后将两份数据的DataFrame,转换为JavaPairRDD,执行join transformation
​​// (将DataFrame转换为JavaRDD,再map为JavaPairRDD,然后进行join)
​​JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD = ​​​​goodStudentScoresDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

private static final long serialVersionUID = 1L;

​​​​​@Override
​​​​​public Tuple2<String, Integer> call(Row row) throws Exception {
​​​​​​return new Tuple2<String, Integer>(row.getString(0),
​​​​​​​​Integer.valueOf(String.valueOf(row.getLong(1))));  
​​​​​}
​​​​}).join(goodStudentInfosDF.javaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

private static final long serialVersionUID = 1L;

​​​​​@Override
​​​​​public Tuple2<String, Integer> call(Row row) throws Exception {
​​​​​​return new Tuple2<String, Integer>(row.getString(0),
​​​​​​​​Integer.valueOf(String.valueOf(row.getLong(1))));  
​​​​​}
​​​​}));

// 然后将封装在RDD中的好学生的全部信息,转换为一个JavaRDD<Row>的格式
​​// (将JavaRDD,转换为DataFrame)
​​JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(

​​​​new Function<Tuple2<String,Tuple2<Integer,Integer>>, Row>() {

​​​​​private static final long serialVersionUID = 1L;

​​​​​@Override
​​​​​public Row call(
​​​​​​​Tuple2<String, Tuple2<Integer, Integer>> tuple) ​​​​​​​throws Exception {
​​​​​​return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
​​​​​}
​​​​});

​​// 创建一份元数据,将JavaRDD<Row>转换为DataFrame
​​List<StructField> structFields = new ArrayList<StructField>();
​​structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
​​structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));  
​​structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));  
​​StructType structType = DataTypes.createStructType(structFields);  
​​DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);

// 将好学生的全部信息保存到一个json文件中去
// (将DataFrame中的数据保存到外部的json文件中去)         
goodStudentsDF.write().format("json").save("hdfs://spark1:9000/spark-study/good-students");  
​}
}

查看结果: hadoop fs -text /spark-study/good-students/part-r*

Scala版本

代码语言:scala
复制
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.LongType


/**
 * JSON data source example.
 *
 * Loads student scores from a JSON file, selects the students whose score is
 * 80 or above, looks up their basic information (age) from a second,
 * in-memory JSON data set, joins the two by name and writes the combined
 * records (name, score, age) back out as JSON.
 *
 * @author Administrator
 */
object JSONDataSource {

  /**
   * Runs the job end to end.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("JSONDataSource")
    val sc = new SparkContext(conf)

    try {
      val sqlContext = new SQLContext(sc)

      // Create the student-score DataFrame from the JSON file.
      val studentScoresDF = sqlContext.read.json("hdfs://spark1:9000/spark-study/students.json")

      // Query the name and score of every student who scored 80 or above,
      // then collect just the names.
      studentScoresDF.registerTempTable("student_scores")
      val goodStudentScoresDF =
        sqlContext.sql("select name,score from student_scores where score>=80")
      val goodStudentNames = goodStudentScoresDF.rdd.map(_.getString(0)).collect()

      if (goodStudentNames.isEmpty) {
        // An empty IN list ("name in ()") is not valid SQL, so there is
        // nothing to look up or write in this case.
        println("No student scored 80 or above; nothing to write.")
      } else {
        // Create the basic-information DataFrame from an RDD of JSON strings.
        val studentInfoJSONs = Array(
          "{\"name\":\"Leo\", \"age\":18}",
          "{\"name\":\"Marry\", \"age\":17}",
          "{\"name\":\"Jack\", \"age\":19}")
        val studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs, 3)
        val studentInfosDF = sqlContext.read.json(studentInfoJSONsRDD)

        // Query the basic information of the good students.
        studentInfosDF.registerTempTable("student_infos")

        // NOTE(review): names are interpolated into the statement without
        // escaping; a name containing a single quote would break the query.
        val quotedNames = goodStudentNames.map(name => s"'$name'").mkString(",")
        val sql = s"select name,age from student_infos where name in ($quotedNames)"
        val goodStudentInfosDF = sqlContext.sql(sql)

        // Join scores with basic information by name: (name, (score, age)).
        val scoresByName = goodStudentScoresDF.rdd.map { row =>
          (row.getAs[String]("name"), row.getAs[Long]("score"))
        }
        val agesByName = goodStudentInfosDF.rdd.map { row =>
          (row.getAs[String]("name"), row.getAs[Long]("age"))
        }
        val goodStudentsRDD = scoresByName.join(agesByName)

        // Convert the joined tuples into rows, narrowing the long values to
        // int so they match the schema below.
        val goodStudentRowsRDD = goodStudentsRDD.map {
          case (name, (score, age)) => Row(name, score.toInt, age.toInt)
        }

        val structType = StructType(Array(
          StructField("name", StringType, true),
          StructField("score", IntegerType, true),
          StructField("age", IntegerType, true)))

        val goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType)

        // Save the combined records as JSON.
        goodStudentsDF.write.format("json")
          .save("hdfs://spark1:9000/spark-study/good-students-scala")
      }
    } finally {
      // Release the Spark context whether the job succeeded or failed.
      sc.stop()
    }
  }

}
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2019-02-22,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档