我在R中有这种格式的数据集:
#!/usr/bin/expect -f
#Usage sshsudologin.expect <host> <ssh user> <ssh password> <foldername>
# Recursively copies /sourcefolder/<foldername> to /export/home/<ssh user>/ on
# <host> using scp, and answers the interactive prompts automatically.
# Arguments are read by position: argv 0 = host, argv 1 = ssh user,
# argv 2 = ssh password, argv 3 = folder name.
# NOTE(review): the password is supplied as a command-line argument; confirm
# this is acceptable where the script is run.
# Each expect command below waits at most 20 seconds for a match.
set timeout 20
# Launch the recursive copy into the remote user's directory under /export/home.
spawn scp -r "/sourcefolder/[lindex $argv 3]" [lindex $argv 1]@[lindex $argv 0]:"/export/home/[lindex $argv 1]/"
# Handle whichever prompt shows up first: a yes/no question or the password prompt.
expect "yes/no" {
# A yes/no question was asked (presumably the host-key confirmation; verify):
# answer yes, then wait for the password prompt and send the password.
send "yes\r"
expect "*?assword" { send "[lindex $argv 2]\r" }
} "*?assword" { send "[lindex $argv 2]\r" }
# Wait for the spawned scp process to close its output.
# NOTE(review): the 20-second timeout presumably applies to this wait as well,
# so a long transfer may not be waited for in full; confirm.
expect eof
我需要将其转换为以下格式:
+----------+-------+-----------+
| Person | Group | Timestamp |
+----------+-------+-----------+
| Person A | X | 12:00 PM |
| Person A | X | 12:01 PM |
| Person A | X | 12:03 PM |
| Person A | Y | 12:10 PM |
| Person A | Y | 12:11 PM |
| Person A | Y | 12:12 PM |
| Person A | X | 12:20 PM |
| Person A | X | 12:21 PM |
| Person A | X | 12:22 PM |
| … | | |
+----------+-------+-----------+
(将连续的相同条目合并为一条;同一个组可以在其他组之后再次出现,如上例所示,组的顺序为 X > Y > X)
我有数百个人,大约 2000 万条记录。我尝试过使用 for 循环,但耗时太长。
请告诉我是否有更简单的方法来实现这一目标。
感谢您的帮助。预先感谢。
答案 0 :(得分:3)
这是一个data.table解决方案,应该相当快。
library(data.table)
# Label every run of consecutive identical Group values within a Person using
# rleid(), then keep one row per run. All rows of a run share the same
# (Person, Ranking, Group) triple, so unique() collapses each run to one row.
# Grouping by Person first also yields the column order Person, Ranking, Group
# shown in the printed output below.
unique(dt[, .(Ranking = rleid(Group), Group), by = Person])
# Person Ranking Group
# 1: Person A 1 X
# 2: Person A 2 Y
# 3: Person A 3 X
(最初的方法没有为每个人单独计算 rleid,现已编辑修正。)
另一种方法。不确定这样是否更快,但我们可以把问题理解为:保留 Person 或 Group 与上一行不同的行,然后在每个 Person 内依次编号:
# Keep the rows where Person or Group differs from the previous row. The first
# row has no predecessor, so shift() gives NA there and the is.na() test keeps
# it. The surviving rows are then numbered 1..N within each Person.
dt[
  is.na(shift(Person)) | Person != shift(Person) | Group != shift(Group),
  .(Person, Group)
][
  , Ranking := seq_len(.N), by = Person
][]
# Person Group Ranking
# 1: Person A X 1
# 2: Person A Y 2
# 3: Person A X 3
使用此数据:
# Sample input: pipe-delimited text with a header row, parsed by fread().
dt <- fread(
  input = " Person | Group | Timestamp
Person A | X | 12:00 PM
Person A | X | 12:01 PM
Person A | X | 12:03 PM
Person A | Y | 12:10 PM
Person A | Y | 12:11 PM
Person A | Y | 12:12 PM
Person A | X | 12:20 PM
Person A | X | 12:21 PM
Person A | X | 12:22 PM",
  sep = "|"
)
答案 1 :(得分:1)
library(dplyr)
library(tidyr)
# For each Person, keep only the first row of every run of consecutive
# identical Group values and number those runs 1, 2, 3, ...
# The numbering is done while the data is still grouped by Person so that
# Ranking restarts at 1 for every person; numbering after ungroup() would run
# a single counter across all persons.
d %>%
  group_by(Person) %>%
  # sequence(rle(Group)$lengths) counts 1, 2, ... inside each run, so a value
  # of 1 marks the first row of a run.
  filter(sequence(rle(Group)$lengths) == 1) %>%
  mutate(Ranking = row_number()) %>%
  ungroup() %>%
  select(-Timestamp)
## A tibble: 3 x 3
# Person Group Ranking
# <chr> <chr> <int>
#1 Person A X 1
#2 Person A Y 2
#3 Person A X 3
在基本R中
# Split the rows by Person, collapse each person's Group column into its runs
# of consecutive identical values with rle(), number the runs, and stack the
# per-person results into one data frame.
runs_by_person <- lapply(split(d, d$Person), function(rows) {
  runs <- rle(rows$Group)
  data.frame(
    Person = rows$Person[1],
    Group = runs$values,
    Ranking = seq_along(runs$values)
  )
})
do.call(rbind, runs_by_person)
数据
# Sample input: nine rows for a single person whose Group runs are X, Y, X.
d <- data.frame(
  Person = rep("Person A", 9),
  Group = rep(c("X", "Y", "X"), each = 3),
  Timestamp = c(
    "12:00 PM", "12:01 PM", "12:03 PM",
    "12:10 PM", "12:11 PM", "12:12 PM",
    "12:20 PM", "12:21 PM", "12:22 PM"
  ),
  stringsAsFactors = FALSE
)
答案 2 :(得分:1)
这是一个 tidyverse 解决方案,在返回排名之前,会先确保每个 Person 内的时间戳按升序排列。
library(tidyverse)
#' Number the runs of consecutive identical `Group` values
#'
#' @param data A data frame (or list) with a `Group` column, already in the
#'   row order in which runs should be detected.
#' @return A data frame with one row per run: `Group` holds the run's value
#'   and `Ranking` holds the run's position (1, 2, 3, ...).
get_ranking <- function(data) {
  run_values <- rle(data$Group)[["values"]]
  data.frame(
    Group = run_values,
    Ranking = seq_len(length(run_values))
  )
}
# Sort the rows by Timestamp, then replace each person's rows with the
# numbered runs returned by get_ranking(). group_modify() passes the group's
# rows and its key to the function; only the rows are needed here.
dat %>%
  group_by(Person) %>%
  arrange(Timestamp) %>%
  group_modify(function(rows, key) get_ranking(rows))
使用此数据:
# Sample input: one person, three runs of groups (X, Y, X) of three rows each,
# with timestamps one minute apart.
# NOTE(review): no format is passed to as.POSIXct(), so its default formats
# are used; confirm the " PM" suffix is interpreted as intended.
dat <- data.frame(
  Person = "Person A",
  Group = rep(c("X", "Y", "X"), each = 3),
  Timestamp = as.POSIXct("2010-01-01 12:00 PM") + seq_len(9) * 60,
  stringsAsFactors = FALSE
)
要产生此输出:
# A tibble: 3 x 3
# Groups: Person [1]
# Person Group Ranking
# <chr> <fct> <int>
# 1 Person A X 1
# 2 Person A Y 2
# 3 Person A X 3