“Higher Order Functions - Transform”可以用来根据字典中的映射关系对“col”中的元素进行排名,并进行排序以获取排名最低的元素。
from pyspark.sql import functions as F
from itertools import chain
# Sample rows: "col" holds a comma-separated list of labels, or None.
data = [
    (1, "A,B,C"),
    (2, "D,C"),
    (3, "B,C,A"),
    (4, None),
]
df = spark.createDataFrame(data, ("id", "col"))

# Rank associated with each label.
d = {'A': 1, 'B': 2, 'C': 3, 'D': 4}

# create_map expects alternating key/value columns, so flatten the dict
# items into one list of literals: key1, value1, key2, value2, ...
mapper = F.create_map(
    [F.lit(part) for pair in d.items() for part in pair]
)
"""
Mapper has the value Column<'map(A, 1, B, 2, C, 3, D, 4)'>
"""
# Step 1: turn the comma-separated string into an array of labels.
labels_df = df.withColumn("col", F.split(F.col("col"), ","))

# Step 2: expose the label -> rank map as a column so that the SQL
# expression below can refer to it by name.
with_mapper_df = labels_df.withColumn("mapper", mapper)

# Step 3: pair every label with its rank. The rank is the first struct
# field so that it drives the struct comparison done by array_min.
ranked_df = with_mapper_df.withColumn(
    "col",
    F.expr("transform(col, x -> struct(mapper[x] as rank, x as col))"),
)

# Step 4: keep the smallest struct of each array and pull out its label.
lowest_df = ranked_df.withColumn("col", F.array_min(F.col("col"))["col"])

# Print only the id and the label that was picked.
lowest_df.select("id", "col").show()
"""
+---+----+
| id| col|
+---+----+
| 1| A|
| 2| C|
| 3| A|
| 4|null|
+---+----+
"""
可以使用 array_min 函数来代替排序并获取数组的第一个元素。 - blackbishop