My df structure contains the following columns: RowID, UserID, Event.
There are multiple rows per userID, and many different user IDs. Event will be an integer >= 0.
I need R to find the MAXIMUM RowID where Event is greater than 0 for a given UserID, and then flag any subsequent rows for that UserID as "after" in a new column (otherwise, flag them as "before").
Example:
RowID, UserID, Event
I am completely new to R, so I am not even sure where to start with this. I know how to do it in Excel, but my CSV is too large for the calculation.
Thanks in advance.
Answer 0 (score: 0)
This is a tricky task using base R functions alone. This solution uses the dplyr
package, which I recommend learning if you are going to be doing R programming.
Generate some data:
library(dplyr)
df <- data.frame(rowID = 1:5, userID = c(999,999,999,111,111), event = c(0,1,0,1,1))
df
rowID userID event
1 1 999 0
2 2 999 1
3 3 999 0
4 4 111 1
5 5 111 1
Filter the rows to only those where event
equals 1, group by userID, and compute the maximum rowID per user.
df %>% filter(event == 1) %>% group_by(userID) %>% summarise(maxR = max(rowID))
# A tibble: 2 x 2
userID maxR
<dbl> <dbl>
1 111 5
2 999 2
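To actually add the "before"/"after" column (a minimal sketch of the remaining step, my addition rather than part of the original answer), one way is to join this per-user maximum back onto df and compare each rowID against it:

library(dplyr)
## Per-user maximum rowID among rows with an event, as above.
max_ids <- df %>%
  filter(event == 1) %>%
  group_by(userID) %>%
  summarise(maxR = max(rowID))
## Users with no event == 1 get NA for maxR after the join and are
## labelled "before"; rows past a user's maximum are labelled "after".
df %>%
  left_join(max_ids, by = "userID") %>%
  mutate(output = ifelse(is.na(maxR) | rowID <= maxR, "before", "after")) %>%
  select(-maxR)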
Answer 1 (score: 0)
Below you will find, step by step, how the output
column is computed.
Note that I added a user with no event greater than 0; this results in NA
as that user's maximum rowID
, which is handled in an extra assignment.
> df <- read.table(header = TRUE, sep=",", text = "rowID, userID, event
+ 1, 999, 0
+ 2, 999, 1
+ 3, 999, 0
+ 4, 100, 0
+ 5, 100, 1
+ 6, 100, 0
+ 7, 100, 1
+ 8, 100, 0
+ 9, 100, 0
+ 10, 101, 0
+ 11, 101, 0
+ 12, 102, 1
+ ")
>
> ## filter events
> df1 <- df[df$event > 0,]
> ## calculate max rowID per user
> max <- setNames(aggregate(df1$rowID, by = list(df1$userID), max) , c("userID", "maxRowID"))
> max
userID maxRowID
1 100 7
2 102 12
3 999 2
>
> ## merge the max to the dataframe
> mrg <- merge(x = df, y = max, by = "userID" , all.x = TRUE)
> ## establish the original order
> mrg <- mrg[with(mrg, order(rowID)), ]
> mrg
userID rowID event maxRowID
10 999 1 0 2
11 999 2 1 2
12 999 3 0 2
1 100 4 0 7
2 100 5 1 7
5 100 6 0 7
6 100 7 1 7
3 100 8 0 7
4 100 9 0 7
7 101 10 0 NA
8 101 11 0 NA
9 102 12 1 12
>
> ## calculate output
> output <- ifelse( mrg$rowID > mrg$maxRowID,'after','before')
> ## consider also case with no event > 0
> output[is.na(output)] <- 'before'
>
> ## add the output column to the original dataframe
> df$output <- output
> df
rowID userID event output
1 1 999 0 before
2 2 999 1 before
3 3 999 0 after
4 4 100 0 before
5 5 100 1 before
6 6 100 0 before
7 7 100 1 before
8 8 100 0 after
9 9 100 0 after
10 10 101 0 before
11 11 101 0 before
12 12 102 1 before
>
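For reference, a compact base-R alternative (my sketch, equivalent in spirit to the steps above, not part of the original answer): ave() computes, per user, the maximum rowID among rows with event > 0. Users with no such rows yield -Inf (the warning from max() on an all-NA group is suppressed), and the is.finite() check then maps them to "before".

## Per-row value: the maximum rowID with event > 0 for that row's user.
m <- suppressWarnings(
  ave(ifelse(df$event > 0, df$rowID, NA), df$userID,
      FUN = function(x) max(x, na.rm = TRUE)))
## Rows past their user's maximum are "after"; everything else,
## including users with no event > 0 (m == -Inf), is "before".
df$output <- ifelse(is.finite(m) & df$rowID > m, "after", "before")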