我们在进行 ETL(Extract-Transform-Load)的过程中,经常会遇到从不同数据源获取的不同格式的数据,其中某些字段是 json 格式,里面拼接了很多字段 key 和指标值 value,今天讲一下如何把相关数据解析出来。
-- Extract individual fields from a JSON object string with get_json_object.
-- Each call evaluates one JSONPath expression ('$.key') against the JSON text,
-- so the JSON literal is repeated once per extracted field.
select
    get_json_object('{"user_name":"chimchim","age":30}', '$.user_name') as user_name,
    get_json_object('{"user_name":"chimchim","age":30}', '$.age') as age
;  -- terminate the statement so it does not run into the next select
-- Extract several top-level keys from one JSON object in a single json_tuple call.
-- The alias list names the output columns after the keys; without it the
-- columns get generated names (c0, c1, c2 in Hive).
select json_tuple('{"user_name":"chimchim","age":30,"sex":"woman"}', 'user_name', 'age', 'sex')
    as (user_name, age, sex)
;  -- terminate the statement so it does not run into the next select
-- Parse a JSON array of objects: break the array into one JSON object string
-- per row, then read the fields of each object with json_tuple.
-- Limitation: elements are located by the textual pattern "},{", so values that
-- themselves contain "},{" or ";" will still be split incorrectly.
select json_tuple(json, 'user_name', 'age', 'sex') as (user_name, age, sex)
from (
    select explode(  -- emit one row per array element
        split(
            regexp_replace(
                regexp_replace(
                    '[{"user_name":"chimchim","age":30,"sex":"woman"},{"user_name":"zonzon","age":2,"sex":"man"}]',  -- JSON text to parse
                    '^\\s*\\[|\\]\\s*$', ''),  -- strip only the outermost brackets so brackets inside values survive
                '\\}\\s*,\\s*\\{', '\\}\\;\\{'),  -- replace the comma between elements with a semicolon
            '\\;')  -- split on that semicolon
    ) as json
) o;
explode函数
-- The array on its own: a single row holding all three elements.
select array('A','B','C') ;
-- The same array passed through explode: one row per element.
select explode(array('A','B','C')) as element;
regexp_replace函数
-- Replace every comma in the JSON text with a semicolon.
select regexp_replace(json_str, ',', ';')
from (
    select '{"user_name":"chimchim","age":30,"sex":"woman"}' as json_str
) src;
lateral view
说明:lateral view 用于和 explode 等 UDTF 一起使用(通常先用 split 等函数把字符串转成数组),能将一行数据拆分成多行数据,在此基础上可以对拆分后的数据进行聚合。lateral view 首先为原始表的每行调用 UDTF,UDTF 会把一行拆分成一行或者多行,lateral view 再把结果组合,产生一个支持别名的虚拟表。
原始数据
-- Source row before flattening: one user with an array-valued column.
select
    'chimchim' as user_name,
    array("a","b","c") as class
;
解析后
-- Flatten the array column: one output row per (user_name, array element).
select
    user_name,
    class_str
from (
    select
        'chimchim' as user_name,
        array("a","b","c") as class
) src
lateral view explode(class) exploded as class_str
;
使用 lateral view 解析json数组
-- Approach 1: explode the JSON array into one JSON object string per row with
-- lateral view, then read each field with get_json_object.
-- Limitation: elements are located by the textual pattern "},{", so values that
-- themselves contain "},{" or ";" will still be split incorrectly.
select
    get_json_object(json_obj, '$.user_name') as user_name,
    get_json_object(json_obj, '$.age') as age,
    get_json_object(json_obj, '$.sex') as sex
from (
    select '[{"user_name":"chimchim","age":30,"sex":"woman"},{"user_name":"zonzon","age":2,"sex":"man"}]' as json_str
) a
lateral view explode(
    split(
        regexp_replace(
            regexp_replace(json_str, '^\\s*\\[|\\]\\s*$', ''),  -- strip only the outermost brackets so brackets inside values survive
            '\\}\\s*,\\s*\\{', '\\}\\;\\{'),  -- replace the comma between elements with a semicolon
        '\\;')  -- split on that semicolon
) tmp as json_obj;  -- distinct table alias (tmp) and column alias (json_obj)
-- Approach 2: explode the JSON array into one JSON object string per row, then
-- expand each object into columns with a second lateral view over json_tuple.
-- Whitespace is tolerated only around the array brackets and the separator
-- between elements; whitespace inside string values is left untouched.
-- Limitation: elements are located by the textual pattern "},{", so values that
-- themselves contain "},{" or ";" will still be split incorrectly.
select user_name, age, sex
from (
    select '[{"user_name":"chimchim","age":30,"sex":"woman"},{"user_name":"zonzon","age":2,"sex":"man"}]' as json
) t
lateral view explode(
    split(
        regexp_replace(
            regexp_replace(json, '^\\s*\\[|\\]\\s*$', ''),  -- strip only the outermost brackets
            '\\}\\s*,\\s*\\{', '\\}\\;\\{'),  -- replace the comma between elements with a semicolon
        '\\;')  -- split on that semicolon
) tmp1 as regexp_json
lateral view json_tuple(regexp_json, 'user_name', 'age', 'sex') tmp2 as user_name, age, sex
;
-- Turn a flat JSON object into one row per entry, exposing the raw
-- "key:value" fragment together with its key and value parts.
select
    json,
    regrep_json,
    split(regrep_json, ':')[0] as key1,
    split(regrep_json, ':')[1] as value1
from (
    select '{"a":0.1,"b":0.2,"c":0.3,"d":0.4,"e":0.5,"f":0.6,"g":0.7}' as json
) src
lateral view explode(
    split(
        regexp_replace(
            regexp_replace(json, '\\{|\\}|\\"', ''),  -- drop braces and double quotes
            '\\s', ''),  -- drop all whitespace
        '\\,')  -- one "key:value" fragment per entry
) kv as regrep_json
;