How can I derive a column based on a pandas UDF in PySpark? I wrote the following UDF:
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("in_type string, in_var string, in_numer int", PandasUDFType.GROUPED_MAP)
def getSplitOP(in_data):
    if in_data is None or len(in_data) < 1:
        return None
    # Sample input: Input/variable.12-2017
    splt = in_data.split("/", 1)
    in_type = splt[0]
    splt_1 = splt[1].split(".", 1)
    in_var = splt_1[0]
    splt_2 = splt_1[1].split("-", 1)
    in_numer = int(splt_2[0])
    return (in_type, in_var, in_numer)
    # Expected output: ("input", "variable", 12)

df = df.withColumn("splt_col", getSplitOP(df.In_data))
Can someone help me figure out what is wrong with the code above and why it does not work?
Posted on 2020-02-18 22:16:41
This will work:
df = spark.createDataFrame(
    [("input/variable.12-2017",), ("output/invariable.11-2018",)],
    ("in_data",))
df.show()

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("in_type string, in_var string, in_numer int", PandasUDFType.GROUPED_MAP)
def getSplitOP(pdf):
    # pdf is a pandas DataFrame holding one group of rows, not a single string
    in_data = pdf.in_data
    # Sample input: Input/variable.12-2017
    splt = in_data.apply(lambda x: x.split("/", 1))
    in_type = splt.apply(lambda x: x[0])
    splt_1 = splt.apply(lambda x: x[1].split(".", 1))
    in_var = splt_1.apply(lambda x: x[0])
    splt_2 = splt_1.apply(lambda x: x[1].split("-", 1))
    in_numer = splt_2.apply(lambda x: int(x[0]))
    return pd.DataFrame({"in_type": in_type, "in_var": in_var, "in_numer": in_numer})
    # Expected output: ("input", "variable", 12)

df = df.groupBy().apply(getSplitOP)
df.show()
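As a side note beyond the original answer: PandasUDFType.GROUPED_MAP is deprecated as of Spark 3.0, where the same grouped-map pattern is written with DataFrame.groupBy().applyInPandas. A minimal sketch under that assumption, using a vectorized regex extract instead of per-element apply (get_split_op and the regex are illustrative names, not from the original post):

import pandas as pd

def get_split_op(pdf: pd.DataFrame) -> pd.DataFrame:
    # Split "input/variable.12-2017" into its three parts in one vectorized pass
    parts = pdf.in_data.str.extract(r"^([^/]+)/([^.]+)\.(\d+)-")
    return pd.DataFrame({
        "in_type": parts[0],
        "in_var": parts[1],
        "in_numer": parts[2].astype(int),
    })

df.groupBy().applyInPandas(
    get_split_op,
    schema="in_type string, in_var string, in_numer int",
).show()

For the two sample rows above, both the original answer and this sketch should produce rows like (input, variable, 12) and (output, invariable, 11).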
apply operates element-wise on each Series (see https://stackoverflow.com/questions/54831667).
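To illustrate, a standalone pandas snippet (assuming the same sample strings) showing that Series.apply calls the function once per element, which is why each lambda above receives a single string rather than the whole column:

import pandas as pd

s = pd.Series(["input/variable.12-2017", "output/invariable.11-2018"])
# apply invokes the lambda once per element; the result is a Series of lists,
# roughly: [input, variable.12-2017] and [output, invariable.11-2018]
print(s.apply(lambda x: x.split("/", 1)))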