我有一个超级凌乱的数据集。有两列标识每一行:
" ID" - 这是每条记录的唯一标识符
"代码" - 指项目代码。一个项目"代码"可以有几个记录。
数据集样本:
df <- data.frame(stringsAsFactors=FALSE,
id = c("C01182", "C00966", "C00130", "d34567", "c34567", "C01142",
"C00241", "C00232", "C01094", "C00979", "C00144"),
code = c("13762", "13762", "13762, 13886,13850", "55653", "65247",
"13698", "13698", "13698", "13880", "13773, 13858, 13880", "13773, 13880")
)
我想要的输出示例是:
df1 <- data.frame(stringsAsFactors=FALSE,
id = c("C01182", "C00966", "C00130", "d34567", "c34567", "C01142",
"C00241", "C00232", "C01094", "C00979", "C00144"),
code = c("13762", "13762", "13762, 13886,13850", "55653", "65247",
"13698", "13698", "13698", "13880", "13773, 13858, 13880", "13773, 13880"),
new_col = c("unique_id_1", "unique_id_1", "unique_id_1", "unique_id_2",
"unique_id_3", "unique_id_4", "unique_id_4",
"unique_id_4", "unique_id_5", "unique_id_5", "unique_id_5")
)
new_col
由code
:code
的记录,这些记录也需要标记为一个唯一的集合唯一标识符可以是任何内容,也不一定是示例。
有关此的任何想法都可以实现
答案 0 :(得分:2)
新答案:
Dim i As Integer
Dim ctrlLabel As Control
For i = 1 To 81
Set ctrlLabel = Me.Controls("labelnum" & i)
If ctrlLabel.Caption = Label1.Caption Then
ctrlLabel.BackColor = Label1.BackColor
End If
Set ctrlLabel = Nothing
Next i
给出:
# load 'data.table' package & convert 'df' to a data.table library(data.table) setDT(df) d1 <- df[, .(code = unlist(tstrsplit(code, ',', type.convert = TRUE))), by = id] d2 <- d1[, dcast(.SD, id ~ code, fun = length)][, -1] df[d1[.(code = as.integer(names(d2)), val = colSums(as.matrix(d2))) , on = .(code), val := i.val][, .(code = code[which.max(val)]), by = id] , on = .(id) , new_col := rleid(i.code)][]
这是做什么的:
> df
id code new_col
1: C01182 13762 1
2: C00966 13762 1
3: C00130 13762, 13886,13850 1
4: d34567 55653 2
5: c34567 65247 3
6: C01142 13698 4
7: C00241 13698 4
8: C00232 13698 4
9: C01094 13880 5
10: C00979 13773, 13858, 13880 5
11: C00144 13773, 13880 5
将setDT(df)
转换为df
。data.table
将d1 <- df[, .(code = unlist(tstrsplit(code, ',', type.convert = TRUE))), by = id]
转换为长格式,其中每个df
都有一个新行。code
创建一个广d2 <- d1[, dcast(.SD, id ~ code, fun = length)][, -1]
个data.table
列作为列,每个code
的位置指示id
与code
相关联的次数它。使用[-1]
时,id
列会被删除,因为下一步我们不需要它。.(code = as.integer(names(d2)), val = colSums(as.matrix(d2)))
为每data.table
次code
创建一个很长的临时data.table
。d1
与code
加为on = .(code)
作为加入密钥(codes
),并将计数添加到相应的{{1}通过引用(val := i.val
)。在每次id
之后,只有code
部分选择计数最多的val
(= [, .(code = code[which.max(val)]), by = id]
。df
(id
)加入on = .(id)
,new_col
由创建的<{1}}创建code
(new_col := rleid(code)
)的em> run-lenght-id 。@minem指定的较大数据集的速度比较:
n1 <- 10000
n2 <- 10000
set.seed(20)
ll <- lapply(1:n1, function(x) sample(1:n2, sample(1:5, 1)))
dfl <- data.table(id = 1:n1, code = sapply(ll, paste, collapse = ', '))
时间安排:
> system.time(getGroupsJaap(dfl)) user system elapsed 1.878 0.595 2.479 > system.time(getGroupsMinem(dfl)) user system elapsed 4.332 0.598 4.931
通过将colSums
替换为colSums2
- matrixStats
,可以实现我的方法的进一步(虽然是次要的)。
旧回答:
dfc <- lapply(strsplit(df$code, ','), type.convert)
m <- as.data.table(outer(unlist(dfc),
unlist(dfc),
'==')
)[, lapply(.SD, sum), rep(seq_along(dfc), lengths(dfc))
][, rep := NULL][, t(.SD)]
dt <- data.table(id = rep(df$id, lengths(dfc)),
m)[, grp := .GRP, by = V1:V11
][, rs := rowSums(.SD), .SDcols = 2:12
][, .(grp = grp[rs == max(rs)]), by = id
][, unid := paste0('unique_id_', .GRP), by = grp][]
df[dt, on = .(id), new_col := unid][]
答案 1 :(得分:1)
getGroups <- function(df) {
require(data.table)
setDT(df)
l <- strsplit(df$code, ",")
l <- lapply(l, as.integer)
x <- rep(df$id, times = sapply(l, length))
d <- data.table(id = x, code = unlist(l))
D <- dcast(d, id ~ code, fun.aggregate = length, value.var = 'code')
x <- as.matrix(D[, -1])
g <- rep(0L, nrow(x))
cols <- 1:ncol(x)
i <- cols[1]
test <- colSums(x)
gi <- 1
while (length(cols) > 0) {
r = F
while (r == F) {
y <- rowSums(x[, i, drop = F]) > 0
ssums <- colSums(x[y, , drop = F])
i <- ssums > 0
r <- all(test[i] == ssums[i])
}
g[y] <- gi
gi <- gi + 1
cols <- cols[!(cols %in% which(i))]
i <- cols[1]
}
m <- D[, .(id, group = g)]
results <- merge(df, m, by = 'id', sort = F)
results[]
}
结果:
getGroups(df)
# id code group
# 1: C01182 13762 2
# 2: C00966 13762 2
# 3: C00130 13762, 13886,13850 2
# 4: d34567 55653 4
# 5: c34567 65247 5
# 6: C01142 13698 1
# 7: C00241 13698 1
# 8: C00232 13698 1
# 9: C01094 13880 3
# 10: C00979 13773, 13858, 13880 3
# 11: C00144 13773, 13880 3
为了提高速度,我们可以colSums
/ rowSums
替换colSums2
/ rowSums2
:
getGroups <- function(df) {
require(data.table)
require(matrixStats)
setDT(df) # convert df to data.table
l <- strsplit(df$code, ",") # split codes to list
l <- lapply(l, as.integer) # convert them to integers for efficiency
x <- rep(df$id, times = sapply(l, length)) # create id vector for each code
d <- data.table(id = x, code = unlist(l)) # combine into data.table
# converts the data from long to wide format ( each column represents if id has particular code):
D <- dcast(d, id ~ code, fun.aggregate = length, value.var = 'code')
x <- as.matrix.data.frame(D[, -1]) # convert to matrix and removes id column
g <- rep(0L, nrow(x)) # initialise result vector
cols <- 1:ncol(x) # creates column indices vector
test <- colSums(x) # calculates how much id have each code / for testing if we have selected all
gi <- 1 # first group value
while (length(cols) > 0) {
i <- cols[test[cols] == max(test[cols])][1] # selects code column from remaining which have largest count of id`s
r <- F # initialise indicator if we have selected all id in group
while (r == F) {
if (is.integer(i) != T) i <- which(i == T) # if logical converts to integer indicies
y <- rowSums2(x, cols = i) > 0 # get indices of ids which is in current selection
ssums <- colSums2(x, rows = y) # for those ids get all code columns and cont how many ids have each
i <- ssums > 0 # converts to logical
r <- all(test[i] == ssums[i]) # if selected column sums are equal to initial col sums, then we have selected all one group ids
}
g[y] <- gi # give group id
gi <- gi + 1 # increase group id
cols <- cols[!(cols %in% which(i))] # remove cols that was in this group
}
m <- D[, .(id, group = g)]
results <- merge(df, m, by = 'id', sort = F) # merge group id to initial data
results[]
}
更大数据的时间:
n1 <- 10000
n2 <- 10000
set.seed(20)
ll <- lapply(1:n1, function(x) sample(1:n2, sample(1:5, 1)))
df <- data.table(id = 1:n1, codes = sapply(ll, paste, collapse = ', '))
system.time(wFroups1 <- getGroupsOld(df)) # 17.96
system.time(wFroups2 <- getGroups(df)) #5.35