我正在寻找一种简单的方法,在不改变日期顺序的前提下统计各个日期出现的次数。我有一个包含大量日期的数据框列,想统计每个日期出现了多少次。
假设我有这样一列数据:
data[,1]
18/12/2015
18/12/2015
18/12/2015
01/01/2016
02/02/2016
02/02/2016
我可以使用函数table()
来计算出现的次数:table(data[,1])
但结果将如下所示:
Var freq
01/01/2016 1
02/02/2016 2
18/12/2015 3
我不想要这种顺序,我想保留上面显示的原始顺序。我找过可以关闭该函数排序行为的选项,但似乎并不存在(函数aggregate()也是如此)。
有人有想法吗?
答案 0 :(得分:8)
以下是两个选项。
首先,我将创建一些数据:
> set.seed(123)
> x <- sample(LETTERS[1:5], 10, TRUE)
> x
[1] "B" "D" "C" "E" "E" "A" "C" "E" "C" "C"
此时table(x)
按排序顺序给出结果:
> table(x)
x
A B C D E
1 1 4 1 3
@akrun建议创建一个具有指定级别的因子,它可以获得您想要的顺序:
> y <- factor(x, levels=unique(x))
> table(y)
y
B D C E A
1 1 4 3 1
或者,您也可以根据以下等级重新排序原始表:
> table(x)[rank(unique(x))]
x
B D C E A
1 1 4 3 1
感谢@lmo,更简洁的方法就是:
> table(x)[unique(x)]
x
B D C E A
1 1 4 3 1
答案 1 :(得分:6)
# Your data
data <- read.table(text="18/12/2015
18/12/2015
18/12/2015
01/01/2016
02/02/2016
02/02/2016")
# library() stops with an error if data.table is not installed; require()
# would only return FALSE and let the script fail later on data.table().
library(data.table)
dt <- data.table( data )
# Your data looks like this:
dt
# V1
#1: 18/12/2015
#2: 18/12/2015
#3: 18/12/2015
#4: 01/01/2016
#5: 02/02/2016
#6: 02/02/2016
# The result is this:
# .N is the number of rows in each group; grouping with `by` (as opposed to
# `keyby`) leaves the groups in their order of first appearance.
dt[ , .N , by = V1 ]
# V1 N
#1: 18/12/2015 3
#2: 01/01/2016 1
#3: 02/02/2016 2
答案 2 :(得分:1)
使用dplyr
library(dplyr)
# Attach the per-group row count to every row, then drop duplicate rows.
# unique() keeps the first occurrence of each row, so the values stay in
# their order of first appearance.
# The column of the sample data is named V1 (its dput() sets .Names = "V1"),
# so the grouping has to use V1.
unique(df %>%
  group_by(V1) %>%
  mutate(count = n()))
#Source: local data frame [3 x 2]
#Groups: V1 [3]
# V1 count
# (fctr) (int)
#1 18/12/2015 3
#2 01/01/2016 1
#3 02/02/2016 2
数据:
dput(df)
structure(list(Var = structure(c(3L, 3L, 3L, 1L, 2L, 2L), .Label = c("01/01/2016",
"02/02/2016", "18/12/2015"), class = "factor")), .Names = "V1", class = "data.frame", row.names = c(NA,
-6L))
编辑:
更简单的方法(由@lukeA指出)只是,
library(dplyr)
# Count rows per distinct value of V1 (the column name set by the sample
# data's dput()).
# NOTE: sort = TRUE orders the result by descending count, not by first
# appearance, so the rows below are not in the original data order.
count(df, V1, sort = TRUE)
#Source: local data frame [3 x 2]
# V1 n
# (fctr) (int)
#1 18/12/2015 3
#2 02/02/2016 2
#3 01/01/2016 1
答案 3 :(得分:1)
编写计时测试有点困难,因为并非所有答案都接受data.table作为输入。我是这样做的:
# Benchmark wrapper for the dplyr answer: count rows per distinct V1 value.
# sort = TRUE returns the groups ordered by descending count.
# dplyr must already be attached by the caller.
sotos <- function(testdat) {
  # library(dplyr)
  count(testdat, V1, sort = TRUE)
}
# Benchmark wrapper for the data.table answer: convert the input to a
# data.table and return the row count (.N) for each V1 group.
# data.table must already be attached by the caller.
simon <- function(testdat) {
  # require(data.table)
  as_table <- data.table(testdat)
  as_table[, .N, by = V1]
}
# Tabulate x and return the counts in order of first appearance.
#
# x: an atomic vector or factor of values to count.
# Returns a one-dimensional table whose entries follow the order in which
# each distinct value first occurs in x.
mrip <- function(x) {
  # Index the table by name. Subsetting with unique(x) directly treats
  # numeric values as positions, which picks the wrong cells (or NA).
  first_seen <- as.character(unique(x))
  table(x)[first_seen]
}
# Build the benchmark input: 10,000 draws (with replacement) from the first
# 15 capital letters, seeded so the run is reproducible.
set.seed(42)
x <- sample(LETTERS[1:15], size = 1e4, replace = TRUE)
# The same values as a one-column data.table named V1, for the functions
# that expect tabular input.
x2 <- data.table(V1 = x)
library(microbenchmark)
microbenchmark(sotos(x2), simon(x2), mrip(x), times = 10)
Unit: microseconds
expr min lq mean median uq max neval
sotos(x2) 2183.645 2256.855 2984.7473 2352.6430 2507.616 8629.209 10
simon(x2) 770.417 780.338 831.5502 784.7845 846.021 1116.624 10
mrip(x) 745.101 827.206 844.3107 850.4685 865.863 898.021 10
# compare the answers:
> mrip(x)
x
N O E M J H L C K G D B I F A
666 676 659 656 669 631 679 734 677 665 592 672 674 654 696
> t(simon(x2))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
V1 "N" "O" "E" "M" "J" "H" "L" "C" "K" "G" "D" "B"
N "666" "676" "659" "656" "669" "631" "679" "734" "677" "665" "592" "672"
[,13] [,14] [,15]
V1 "I" "F" "A"
N "674" "654" "696"
> t(sotos(x2))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
V1 "C" "A" "L" "K" "O" "I" "B" "J" "N" "G" "E" "M"
n "734" "696" "679" "677" "676" "674" "672" "669" "666" "665" "659" "656"
[,13] [,14] [,15]
V1 "F" "H" "D"
n "654" "631" "592"
根据Frank的评论,我删除了simon内部的data.table()调用。新的结果是:
Unit: microseconds
expr min lq mean median uq max neval
sotos(x2) 2533.274 2708.089 3067.2971 2804.391 2947.218 5598.176 10
simon(x2) 500.154 518.286 621.3618 577.641 740.995 787.179 10
mrip(x) 816.942 950.020 1065.2408 969.007 1282.887 1459.755 10