我想从现有的数据框 "ab" 创建一个新的数据框。新数据框应该看起来像 "Newdf"。
%%timeit
# Timed baseline: for each (date_id, category) group, apply `rank_fun` to the
# `to_rank` columns and collect the result under "<col>_ranked" column names.
# NOTE(review): `df`, `to_rank` and `rank_fun` are defined outside this snippet;
# presumably `rank_fun` ranks the given columns within a group -- confirm.
ranked_cols = [col + '_ranked' for col in to_rank]
ranked = df[['date_id', 'category'] + to_rank].groupby(['date_id', 'category'], as_index = False).apply(lambda x: rank_fun(x, to_rank))
ranked.columns = ranked_cols
# Flatten the index produced by groupby-apply, then re-index on `level_1`.
# NOTE(review): presumably `level_1` holds the original row labels of `df` -- confirm.
ranked.reset_index(inplace = True)
ranked.set_index('level_1', inplace = True)
1 loop, best of 3: 481 ms per loop
# Timing comparison of two alternative implementations on the same grouping.
# NOTE(review): `df`, `rnk_numpy` and `rnk_numba` are defined outside this
# snippet; presumably NumPy- and Numba-based ranking helpers -- confirm.
gcols = ['date_id', 'category']  # grouping keys
rcols = ['var_1', 'var_2', 'var_3']  # columns handed to the helpers
# Per-group apply of `rnk_numpy`; result columns get a "_ranked" suffix.
%timeit df.groupby(gcols)[rcols].apply(rnk_numpy).add_suffix('_ranked')
100 loops, best of 3: 16.4 ms per loop
# `rnk_numba` receives the whole frame plus the key/value column lists.
%timeit rnk_numba(df, gcols, rcols).head()
1000 loops, best of 3: 1.03 ms per loop
可以使用dplyr包来实现吗?如果可以,该怎么做?谢谢。
答案 0 :(得分:2)
比起dplyr,您更需要的是tidyr:
# tidyr supplies the reshaping verb, dplyr the recoding helpers.
library(tidyr)
library(dplyr)

# Stack every column of `ab` into key/value pairs (column name -> Comp,
# cell value -> DV), then replace the column names with numeric codes.
ab %>%
  gather(key = "Comp", value = "DV") %>%
  mutate(Comp = recode(Comp, C1 = 1, c2 = 2))
# Comp DV
# 1 1 1
# 2 1 2
# 3 1 3
# 4 1 4
# 5 1 5
# 6 2 11
# 7 2 12
# 8 2 13
# 9 2 14
# 10 2 15
答案 1 :(得分:1)
使用dplyr和tidyr可以得到非常接近的结果……
library(tidyr)
library(dplyr)

# Reshape `ab` to long form while keeping the values that came from the same
# original row next to each other (row 1: C1 then c2, row 2: C1 then c2, ...).
df2 <- ab %>%
  # Record each row's original position. seq_len(n()) is used instead of
  # 1:n() because 1:n() yields c(1, 0) when `ab` has zero rows.
  mutate(Order = seq_len(n())) %>%
  # Stack the C1 and c2 columns into Comp (source column) / DV (value).
  gather(key = Comp, value = DV, C1, c2) %>%
  # Sort back by original row position so values from one row sit together.
  arrange(Order) %>%
  # Replace the source column names with numeric codes.
  mutate(Comp = recode(Comp, "C1" = 1, "c2" = 2)) %>%
  select(DV, Comp)
df2
DV Comp
1 1 1
2 11 2
3 2 1
4 12 2
5 3 1
6 13 2
7 4 1
8 14 2
9 5 1
10 15 2
答案 2 :(得分:0)
在基础R中可以使用:
# Build a long data frame that interleaves the two columns of `ab` row by row.
# Duplicate every row of `ab` so each original row occupies two consecutive
# rows (row names become 1, 1.1, 2, 2.1, ...).
Ndf <- ab[rep(seq_len(nrow(ab)), each = 2), ]
# Every second row takes its value from the second column of `ab`. Only the
# first column is written, instead of recycling the vector across all columns.
Ndf[seq(2, nrow(Ndf), by = 2), 1] <- ab[, 2]
# The interleaved values alone can also be obtained as a plain vector with
# c(t(ab)), as mentioned in the comments.
# Mark which source column (1 or 2) each row came from. The repeat count is
# taken from `ab` rather than hard-coded to 5, so any row count works.
Ndf[, 2] <- rep.int(1:2, times = nrow(ab))
colnames(Ndf) <- c("DV", "CMT1")
Ndf
# DV CMT1
#1 1 1
#1.1 11 2
#2 2 1
#2.1 12 2
#3 3 1
#3.1 13 2
#4 4 1
#4.1 14 2
#5 5 1
#5.1 15 2
答案 3 :(得分:0)
虽然OP要求的是dplyr解决方案,但我觉得有必要提供一个data.table解决方案。所以,FWIW,这是使用melt()的替代方法。
请注意,此解决方案**不**依赖于ab中的特定列名称,而另外两个dplyr解决方案则依赖于列名。此外,当ab中有两列以上时,它应该也能正常工作(未经测试)。
library(data.table)
# Reshape `ab` to long form, number the source columns, and restore the
# original row order so values from the same row sit next to each other.
# NOTE: setDT() converts `ab` to a data.table by reference and stores its row
# names in a new character column `rn`.
# `rn` is converted with as.integer() before ordering: sorting the character
# row names directly is lexicographic ("10" sorts before "2") and would
# scramble inputs with 10 or more rows.
# NOTE(review): this assumes `ab` has default, integer-like row names.
melt(setDT(ab, keep.rownames = TRUE), id.vars = "rn", value.name = "DV"
  )[, Comp := rleid(variable)
  ][order(as.integer(rn))][, c("rn", "variable") := NULL][]
# DV Comp
# 1: 1 1
# 2: 11 2
# 3: 2 1
# 4: 12 2
# 5: 3 1
# 6: 13 2
# 7: 4 1
# 8: 14 2
# 9: 5 1
#10: 15 2
# Example input: a five-row data frame with two integer columns, C1 and c2.
ab <- data.frame(C1 = 1:5, c2 = 11:15)
ab
# C1 c2
#1 1 11
#2 2 12
#3 3 13
#4 4 14
#5 5 15