sudo rz -E
命令将文件上传至 /opt/module/pig/data 目录，然后在 HDFS 上创建目标目录：
hadoop fs -mkdir /pig
# Open up the /pig directory recursively so the Pig job can read/write it
# (777 is tutorial-grade; tighten permissions in a real deployment)
hadoop fs -chmod -R 777 /pig
# Upload the processed Beijing bus-info CSV from the local filesystem into HDFS
hadoop fs -put /opt/module/pig/data/Processed_Beijing_Bus_Info.csv /pig
# Verify the file landed in /pig
hadoop fs -ls /pig
命令查看上传结果。随后执行以下命令，以 MapReduce 模式启动 Pig（进入 Grunt shell）：
pig -x mapreduce
-- Register the piggybank.jar library so its user-defined functions are available.
-- (Pig Latin line comments use "--"; "#" is not valid comment syntax and would
-- cause a parse error if this script were run as-is.)
REGISTER '/opt/module/pig/lib/piggybank.jar';

-- Load the Beijing bus-info CSV, declaring every field as chararray.
-- NOTE(review): alias "tieck" looks like a typo for "ticket", kept as-is because
-- the FILTER below references it; the pinyin aliases are interpreted from their
-- names and should be confirmed against the actual data:
--   licheng = mileage (里程), gongsi = operating company (公司),
--   gengxin = last updated (更新), wang_* = outbound route, fan_* = return route.
bus_info = LOAD '/pig/Processed_Beijing_Bus_Info.csv' USING PigStorage(',') AS (
    bus_name:chararray,
    bus_type:chararray,
    bus_time:chararray,
    tieck:chararray,
    licheng:chararray,
    gongsi:chararray,
    gengxin:chararray,
    wang_info:chararray,
    wang_buff:chararray,
    fan_info:chararray,
    fan_buff:chararray
);

-- Dump the loaded relation for a visual sanity check.
DUMP bus_info;

-- Remove exact duplicate records.
distinct_data = DISTINCT bus_info;

-- Dump the deduplicated relation for inspection.
DUMP distinct_data;

-- Keep only records where every field is non-empty.
filter_data = FILTER distinct_data BY
    bus_name != '' AND
    bus_type != '' AND
    bus_time != '' AND
    tieck != '' AND
    licheng != '' AND
    gongsi != '' AND
    gengxin != '' AND
    wang_info != '' AND
    wang_buff != '' AND
    fan_info != '' AND
    fan_buff != '';

-- Dump the filtered relation for inspection.
DUMP filter_data;

-- Store the cleaned data back to HDFS as comma-separated text.
STORE filter_data INTO 'hdfs://master:8020/pig_output' USING PigStorage(',');
hadoop fs -cat /pig_output/part-r-00000
原创声明：本文系作者授权腾讯云开发者社区发表，未经许可，不得转载。
如有侵权，请联系 cloudcommunity@tencent.com 删除。