我有一个Spark数据框
<!-- Video player with native controls; "test.jpg" is the poster image and "test.mp4" is the MP4 source. -->
<video controls id="myVideo" poster="test.jpg">
<source src="test.mp4" type="video/mp4" />
</video>
<script>
// Look up the <video> element declared above by its id.
let myVideo = document.getElementById("myVideo");
// When playback reaches the end, re-assign the poster image and the source URL.
// NOTE(review): presumably this is meant to make the poster frame show again
// once the video has finished — confirm against the intended behavior.
myVideo.onended = function() {
myVideo.poster = "test.jpg"
myVideo.src = "test.mp4"
};
</script>
我想从此列中删除不必要的逗号。例如,第一条记录在输出中应显示为 104,111,114,131,157,162,169,174,176。数据框的定义如下:
val df = Seq(
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,104,,,,,,,111,,,,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",101,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,184,,,,, "),
(",,,,104,,,,,,,,,113,,,,,,,,,,,,,,,,,,131,,,,,,,,,,141,142,143,,,146,,,,150,,,,,155,,157,,,,,162,,,,,,,169,,,,,174,,176,177,178,,,,,,,,,,, "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, "),
(",,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,, "),
(",,,,104,,,,,,,111,112,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, "),
(",,102,103,104,,,,,,,,,113,114,,,,,,,121,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,160,,162,,,,,,,,,,,173,174,,176,,178,,,,,,,,,,,"),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, "),
(",,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, "),
(",,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,, ")
).toDF("my_col")
。
输出的字符串不应以逗号开头或结尾。
如何在Spark中执行此操作?
答案 0 :(得分:1)
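首先,以“,”为分隔符拆分字符串;然后使用 array_remove 函数删除空字符串,再用 array_join 将数组重新连接成字符串并用 trim 去掉首尾空格。此时结果末尾仍会残留一个“,”:原字符串以“, ”(逗号加空格)结尾,最后那个只含空格的元素不是空字符串,因此不会被 array_remove 删除。要去掉这个逗号,可以定义一个 udf,在字符串以“,”结尾时删除其最右边的一个字符。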
scala> df.show(false)
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|my_col |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,,,104,,,,,,,111,,,,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,160,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,101,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,184,,,,, |
|,,,,104,,,,,,,,,113,,,,,,,,,,,,,,,,,,131,,,,,,,,,,141,142,143,,,146,,,,150,,,,,155,,157,,,,,162,,,,,,,169,,,,,174,,176,177,178,,,,,,,,,,, |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, |
|,,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,, |
|,,,,104,,,,,,,111,112,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, |
|,,102,103,104,,,,,,,,,113,114,,,,,,,121,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,160,,162,,,,,,,,,,,173,174,,176,,178,,,,,,,,,,,|
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,,,,,,174,,176,,,,,,,,,,,,, |
|,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, |
|,,,103,104,,,,,,,111,,,114,,,,,,,121,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,,,104,,,,,,,111,,,114,,,,,,,,,,,,,,,,,131,,,,,,,,,,,,,,,,,,,,,,,,,,157,,,,,162,,,,,,,169,,,,,174,,176,,,,,,,,,,,,, |
|,,102,,104,,,,,,,,,113,114,,,,,,,,,,,,,,,,130,131,,,,,,,,,,141,142,143,,,146,,,,150,,152,,,,,157,,,,,162,,,,,,,,,,,,174,,,,,,,,,,,,,,, |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
scala> df.select(trim(array_join(array_remove(split($"my_col", ","), ""),",")) as "my_col").show(false)
+-----------------------------------------------------------------------------------+
|my_col |
+-----------------------------------------------------------------------------------+
|104,111,114,131,157,162,169,174,176, |
|104,111,114,131,157,162,169,174,176, |
|104,111,114,131,157,162,169,174,176, |
|104,111,114,131,157,160,162,174,176, |
|104,111,114,131,157,162,169,174,176, |
|104,111,131,157,160,162,169,174,176, |
|103,104,111,114,121,131,157,162,169,174,176, |
|101,102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,184, |
|104,113,131,141,142,143,146,150,155,157,162,169,174,176,177,178, |
|104,111,114,131,157,162,174,176, |
| |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174, |
|104,111,112,114,131,157,162,169,174,176, |
| |
|102,103,104,113,114,121,130,131,141,142,143,146,150,152,157,160,162,173,174,176,178|
|104,111,114,131,157,162,174,176, |
| |
|103,104,111,114,121,131,157,162,169,174,176, |
|104,111,114,131,157,162,169,174,176, |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174, |
+-----------------------------------------------------------------------------------+
// Removes one trailing "," from the input, if present; any other value is returned as-is.
// The explicit null check matters: a null column value reaches the UDF as a null String,
// and calling endsWith on it would throw a NullPointerException. Null is passed through unchanged.
scala> val myUdf = udf{(x:String) => if(x != null && x.endsWith(",")){x.dropRight(1)} else {x}}
myUdf: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))
scala> df.select(myUdf(trim(array_join(array_remove(split($"my_col", ","), ""),","))) as "my_col").show(false)
+-----------------------------------------------------------------------------------+
|my_col |
+-----------------------------------------------------------------------------------+
|104,111,114,131,157,162,169,174,176 |
|104,111,114,131,157,162,169,174,176 |
|104,111,114,131,157,162,169,174,176 |
|104,111,114,131,157,160,162,174,176 |
|104,111,114,131,157,162,169,174,176 |
|104,111,131,157,160,162,169,174,176 |
|103,104,111,114,121,131,157,162,169,174,176 |
|101,102,104,113,114,130,131,141,142,143,146,150,152,157,162,174,184 |
|104,113,131,141,142,143,146,150,155,157,162,169,174,176,177,178 |
|104,111,114,131,157,162,174,176 |
| |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174 |
|104,111,112,114,131,157,162,169,174,176 |
| |
|102,103,104,113,114,121,130,131,141,142,143,146,150,152,157,160,162,173,174,176,178|
|104,111,114,131,157,162,174,176 |
| |
|103,104,111,114,121,131,157,162,169,174,176 |
|104,111,114,131,157,162,169,174,176 |
|102,104,113,114,130,131,141,142,143,146,150,152,157,162,174 |
+-----------------------------------------------------------------------------------+
答案 1 :(得分:1)
您可以使用regexp_replace:
// Normalizes the comma-separated values of "my_col" into a new "cleaned" column:
//   line 1: trim surrounding spaces, then collapse every run of commas into a single comma;
//   line 2: drop the leading comma, if any;
//   line 3: drop the trailing comma, if any.
// The trim is required because the sample values end with ", " (comma + space): without it
// the "$"-anchored pattern on line 3 never matches and the trailing comma is left in place
// (and rows made only of commas would end up as " " instead of an empty string).
val df_cleaned = df.withColumn("cleaned", regexp_replace(trim(col("my_col")), ",+", ","))
  .withColumn("cleaned", regexp_replace(col("cleaned"), "^,", ""))
  .withColumn("cleaned", regexp_replace(col("cleaned"), ",$", ""))
第一行将连续出现的多个逗号合并为一个逗号,第二行和第三行分别删除开头和结尾的逗号。