All of the approaches above work, and in my opinion dropDuplicates is the best one. Below is another way to deduplicate without dropDuplicates, using groupBy with agg. One caveat: first() after a groupBy returns a value from an arbitrary row in each group (its result depends on row order after the shuffle), so it is only appropriate when any representative row will do. If you care about time/performance, dropDuplicates on the key columns is the better choice (time taken: 1563 ms). The full listing and timings follow.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.first

object DropDups {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("ReadFromUrl")
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    spark.sparkContext.setLogLevel("ERROR")

    val data = sc.parallelize(List(
      ("Foo", 41, "US", 3),
      ("Foo", 39, "UK", 1),
      ("Bar", 57, "CA", 2),
      ("Bar", 72, "CA", 2),
      ("Baz", 22, "US", 6),
      ("Baz", 36, "US", 6)
    )).toDF("x", "y", "z", "count")

    // Variant 1: groupBy + agg, taking the first y/z seen per (x, count) group.
    // The DataFrame is rebuilt inside the timed block, so its construction
    // (plus JVM/Spark warm-up) is included in this first measurement.
    spark.time {
      val data = sc.parallelize(List(
        ("Foo", 41, "US", 3),
        ("Foo", 39, "UK", 1),
        ("Bar", 57, "CA", 2),
        ("Bar", 72, "CA", 2),
        ("Baz", 22, "US", 6),
        ("Baz", 36, "US", 6)
      )).toDF("x", "y", "z", "count")

      val deduped = data
        .groupBy("x", "count")
        .agg(
          first("y").as("y"),
          first("z").as("z")
        )
      deduped.show()
    }

    // Variant 2: dropDuplicates on the key columns.
    spark.time {
      data.dropDuplicates(Array("x", "count")).show()
    }

    spark.stop()
  }
}
Result:
+---+-----+---+---+
| x|count| y| z|
+---+-----+---+---+
|Baz| 6| 22| US|
|Foo| 1| 39| UK|
|Bar| 2| 57| CA|
|Foo| 3| 41| US|
+---+-----+---+---+
Time taken: 7086 ms
+---+---+---+-----+
| x| y| z|count|
+---+---+---+-----+
|Baz| 22| US| 6|
|Foo| 39| UK| 1|
|Bar| 57| CA| 2|
|Foo| 41| US| 3|
+---+---+---+-----+
Time taken: 1563 ms
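
Note that the 7086 ms for the groupBy/agg variant also covers building the DataFrame inside the timed block and first-run warm-up, so the gap somewhat overstates the difference; dropDuplicates is still the simpler and faster option here.

For completeness, a third way to deduplicate without dropDuplicates is a window function with row_number, which, unlike first(), lets you choose deterministically which row survives in each group. This is just a minimal sketch reusing the data DataFrame from the listing above, not part of the timed comparison:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// Keep, per (x, count) group, the row with the smallest y.
// The explicit orderBy makes the surviving row deterministic.
val byKey = Window.partitionBy("x", "count").orderBy(col("y"))

val dedupedByWindow = data
  .withColumn("rn", row_number().over(byKey))
  .filter(col("rn") === 1)
  .drop("rn")

dedupedByWindow.show()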