I have the following Spark DataFrame (sdf):
$ id_op : chr "id1", "id1", "id1","id2", "id2", ...
$ word : chr "w1", "w2", "w3", "w1", "w2", "w3", ...
I obtained it by "exploding"/splitting a string column.
I would like to get the following sdf:
  id_op word
1   id1 w1 w2 w3
2   id2 w1 w2 w3
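For context, here is a minimal sketch of how I build the sdf (toy values; lib_ope is the real source column, as it appears in the generated SQL further down):

library(sparklyr)
library(dplyr)

sc <- spark_connect(master = "local")

# Toy data in the same shape as my real table (values made up)
ops <- data.frame(
  id_op   = c("id1", "id2"),
  lib_ope = c("w1 w2 w3", "w1 w2 w3"),
  stringsAsFactors = FALSE
)
ops.sdf <- copy_to(sc, ops, "ops", overwrite = TRUE)

# Explode/split the string column into one word per row;
# explode() and split() are passed straight through to Spark SQL
words.sdf <- ops.sdf %>%
  mutate(word = explode(split(lib_ope, " "))) %>%
  filter(word != "") %>%
  select(id_op, word)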
I can get this result with dplyr on a local df, using the following code:
df <- df %>%
  arrange(id_op) %>%
  group_by(id_op) %>%
  summarise(clean = paste0(word, collapse = " "))
or, equivalently, with base R's aggregate():

words.sdf <- words.sdf %>%
  aggregate(word ~ id_op, data = ., FUN = paste, collapse = " ")
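For completeness, a quick check that the dplyr version gives the desired output on a plain local data frame (toy data):

library(dplyr)

words.df <- data.frame(
  id_op = rep(c("id1", "id2"), each = 3),
  word  = rep(c("w1", "w2", "w3"), times = 2),
  stringsAsFactors = FALSE
)

words.df %>%
  group_by(id_op) %>%
  summarise(clean = paste0(word, collapse = " "))
#> # A tibble: 2 x 2
#>   id_op clean
#>   <chr> <chr>
#> 1 id1   w1 w2 w3
#> 2 id2   w1 w2 w3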
But when I try the same thing on the Spark DataFrame, it produces the following error:
Error: org.apache.spark.sql.catalyst.parser.ParseException:
extraneous input 'AS' expecting {')', ',', '.', '[', 'OR', 'AND', 'IN', NOT, 'BETWEEN', 'LIKE', RLIKE, 'IS', EQ, '<=>', '<>', '!=', '<', LTE, '>', GTE, '+', '-', '*', '/', '%', 'DIV', '&', '|', '^', STRING}(line 1, pos 63)
== SQL ==
SELECT `id_op`, `word`, CONCAT_WS(" ", `word`, ", " AS "collapse") AS `clean`
---------------------------------------------------------------^^^
FROM (SELECT *
FROM (SELECT `word` AS `word`, `id_op` AS `id_op`
FROM (SELECT *
FROM (SELECT `word`, `id_op`, sum(1.0) OVER (PARTITION BY `id_op`, `word` ROWS UNBOUNDED PRECEDING) AS `index`
FROM (SELECT `word` AS `word`, `id_op` AS `id_op`
FROM (SELECT *
FROM (SELECT `id_op`, `lib_ope`, EXPLODE(SPLIT(`lib_ope`, " ")) AS `word`
FROM `sparklyr_tmp_12fd627911f22`) `zsbvyxooje`
WHERE (`word` != "")) `mvlzopltjk`) `tutpdiuxoe`) `fykysrcram`
WHERE (`index` = 1.0)) `lpouhsurdu`) `lolphnbaqd`
ORDER BY `id_op`) `jsnmrghsuz`
GROUP BY `id_op`, `word`
LIMIT 25
at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(ParseDriver.scala:217)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parse(ParseDriver.scala:114)
at org.apache.spark.sql.execution.SparkSqlParser.parse(SparkSqlParser.scala:48)
at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parsePlan(ParseDriver.scala:68)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:637)
at sun.reflect.GeneratedMethodAccessor41.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at sparklyr.Invoke$.invoke(invoke.scala:102)
at sparklyr.StreamHandler$.handleMethodCall(stream.scala:97)
at sparklyr.StreamHandler$.read(stream.scala:62)
at sparklyr.BackendHandler.channelRead0(handler.scala:52)
at sparklyr.BackendHandler.channelRead0(handler.scala:14)
at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:336)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:336)
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:293)
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:267)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:336)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:357)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:343)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:911)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:643)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:566)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:480)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:442)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:131)
at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144)
at java.lang.Thread.run(Thread.java:745)
Looking at the generated SQL, the collapse argument of paste0 seems to be injected verbatim as an extra argument to CONCAT_WS instead of being translated into an aggregation. I then tried to work around the problem with ft_sql_transformer:
df <- df %>%
  arrange(id_op) %>%
  ft_sql_transformer(sql = "SELECT id_op, concat_ws(' ', word) AS clean
                            FROM __THIS__ GROUP BY id_op, clean")
But it produces exactly the same error. I have been struggling with this for hours and cannot seem to find the answer. This is my first post on Stack Overflow, so apologies if anything is unclear.
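One direction I have been wondering about but have not managed to verify: aggregating the words with Spark's collect_list before concatenating them, either in the SQL itself or by letting sparklyr pass the Spark functions through summarise(). A sketch (names as above, untested on my side):

# Variant 1: aggregate in Spark SQL directly
words.sdf %>%
  ft_sql_transformer(sql = "SELECT id_op,
                                   CONCAT_WS(' ', COLLECT_LIST(word)) AS clean
                            FROM __THIS__
                            GROUP BY id_op")

# Variant 2: rely on sparklyr passing collect_list()/concat_ws()
# through to Spark SQL inside summarise()
words.sdf %>%
  group_by(id_op) %>%
  summarise(clean = concat_ws(" ", collect_list(word)))

Is one of these the right way to do it in sparklyr?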
Thanks in advance for your answers.