我正在尝试根据分位数为每条数据分配一个虚拟变量(dummy)。我把数据分成 3 个分位数区间(1/3、2/3、3/3):如果 Leverage 落在 q1 中,则应在一个单独的列中记为 1;如果落在 q2 中,则在另一列中记为 1(其余列保持为 0)。
这是我的数据示例:
# Sample data for the example.
# gvkey pattern: 4 x gvkey1, 5 x gvkey2, 2 x gvkey3, repeated 4 times.
# NOTE: this yields only 44 identifiers while the frame below declares 66 rows.
k <- rep(rep(c("gvkey1", "gvkey2", "gvkey3"), times = c(4, 5, 2)), times = 4)
# Dates as month/day/year strings: days 1, 1, 3, 4, ..., 11 of December 2000,
# repeated 6 times (66 values).
l <- rep(paste0("12/", c(1, 1, 3:11), "/2000"), times = 6)
# Leverage is simply the running index 1..66.
m <- seq_len(66L)
# Assemble the data frame by hand (names, compact row names, class) with the
# Date column already parsed from its month/day/year text.
y <- structure(
  list(
    Date = as.Date(l, format = "%m/%d/%Y"),
    gvkey = k,
    Leverage = m
  ),
  row.names = c(NA, -66L),
  class = "data.frame"
)
# data.table() comes from the data.table package, which has to be attached
# before it can be called.
library(data.table)
# Convert the sample data to a data.table so that `:=` assignments with `by`
# can be used on `test`.
test <- data.table(y)
这是如上所述的代码:
# Assign every observation to exactly one per-date Leverage tercile dummy.
#
# d1 / d2 hold the 1/3 and 2/3 quantiles of Leverage within each Date. Each
# row's own Leverage is then compared with the breakpoints of its date, using
# right-closed intervals:
#   dquant1 = 1 when Leverage <= d1
#   dquant2 = 1 when d1 < Leverage <= d2
#   dquant3 = 1 when Leverage > d2
# These conditions are mutually exclusive and exhaustive for non-missing
# values, so a row cannot end up with two dummies set, or with none.
# (Comparing the breakpoint columns with whole-sample quantiles, using the
# scalar `&&` inside ifelse(), and assigning to the undefined name `d33_66`
# were the causes of the inconsistent dummies.)

# quantile function per date
d1 <- "d1" # first breakpoint (column name)
test <- test[, (d1) := quantile(Leverage, 1 / 3), by = "Date"]
d2 <- "d2" # second breakpoint (column name)
test <- test[, (d2) := quantile(Leverage, 2 / 3), by = "Date"]

# match companies and quantiles
# On the right-hand side `d1` / `d2` refer to the breakpoint columns created
# above: inside a data.table `j` expression, columns are looked up before
# variables of the calling environment.
dquant1 <- "dquant1"
test <- test[, (dquant1) := as.numeric(Leverage <= d1)]
dquant2 <- "dquant2"
test <- test[, (dquant2) := as.numeric(Leverage > d1 & Leverage <= d2)]
dquant3 <- "dquant3"
test <- test[, (dquant3) := as.numeric(Leverage > d2)]
我在原始数据集中遇到的问题是:有时同一行会在两列中同时得到虚拟变量 1(例如 1 0 1),也就是同时被分到两个投资组合,这正是我想要解决的问题。而在这个样本中,有时某一行一个虚拟变量都得不到。
欢迎任何建议! 谢谢 约翰内斯
答案 0 :(得分:1)
这种方法怎么样?
# Label each row with its per-date Leverage tercile as a three-character code:
#   "100" when Leverage <= d1, "010" when d1 < Leverage <= d2,
#   "001" when Leverage > d2.
# The tercile index (1, 2 or 3) is computed for all rows at once by counting
# how many breakpoints the row's Leverage exceeds. Unlike a row-by-row cut()
# whose last break is the row's own Leverage, this does not fail with
# "'breaks' are not unique" when Leverage equals d1 or d2, and it needs no
# rowwise() pass. The result is still a factor with levels 100 / 010 / 001.
# Expects the columns d1 and d2 to exist in `test` already.
library(dplyr)
test %>%
  as_tibble() %>%
  mutate(dquant = factor(
    1L + (Leverage > d1) + (Leverage > d2),
    levels = 1:3,
    labels = c("100", "010", "001")
  )) %>%
  print(n = Inf)
# A tibble: 66 x 6
Date gvkey Leverage d1 d2 dquant
<date> <chr> <int> <dbl> <dbl> <fct>
1 2000-12-01 gvkey1 1 19.7 38.3 100
2 2000-12-01 gvkey1 2 19.7 38.3 100
3 2000-12-03 gvkey1 3 21.3 39.7 100
4 2000-12-04 gvkey1 4 22.3 40.7 100
5 2000-12-05 gvkey2 5 23.3 41.7 100
6 2000-12-06 gvkey2 6 24.3 42.7 100
7 2000-12-07 gvkey2 7 25.3 43.7 100
8 2000-12-08 gvkey2 8 26.3 44.7 100
9 2000-12-09 gvkey2 9 27.3 45.7 100
10 2000-12-10 gvkey3 10 28.3 46.7 100
11 2000-12-11 gvkey3 11 29.3 47.7 100
12 2000-12-01 gvkey1 12 19.7 38.3 100
13 2000-12-01 gvkey1 13 19.7 38.3 100
14 2000-12-03 gvkey1 14 21.3 39.7 100
15 2000-12-04 gvkey1 15 22.3 40.7 100
16 2000-12-05 gvkey2 16 23.3 41.7 100
17 2000-12-06 gvkey2 17 24.3 42.7 100
18 2000-12-07 gvkey2 18 25.3 43.7 100
19 2000-12-08 gvkey2 19 26.3 44.7 100
20 2000-12-09 gvkey2 20 27.3 45.7 100
21 2000-12-10 gvkey3 21 28.3 46.7 100
22 2000-12-11 gvkey3 22 29.3 47.7 100
23 2000-12-01 gvkey1 23 19.7 38.3 010
24 2000-12-01 gvkey1 24 19.7 38.3 010
25 2000-12-03 gvkey1 25 21.3 39.7 010
26 2000-12-04 gvkey1 26 22.3 40.7 010
27 2000-12-05 gvkey2 27 23.3 41.7 010
28 2000-12-06 gvkey2 28 24.3 42.7 010
29 2000-12-07 gvkey2 29 25.3 43.7 010
30 2000-12-08 gvkey2 30 26.3 44.7 010
31 2000-12-09 gvkey2 31 27.3 45.7 010
32 2000-12-10 gvkey3 32 28.3 46.7 010
33 2000-12-11 gvkey3 33 29.3 47.7 010
34 2000-12-01 gvkey1 34 19.7 38.3 010
35 2000-12-01 gvkey1 35 19.7 38.3 010
36 2000-12-03 gvkey1 36 21.3 39.7 010
37 2000-12-04 gvkey1 37 22.3 40.7 010
38 2000-12-05 gvkey2 38 23.3 41.7 010
39 2000-12-06 gvkey2 39 24.3 42.7 010
40 2000-12-07 gvkey2 40 25.3 43.7 010
41 2000-12-08 gvkey2 41 26.3 44.7 010
42 2000-12-09 gvkey2 42 27.3 45.7 010
43 2000-12-10 gvkey3 43 28.3 46.7 010
44 2000-12-11 gvkey3 44 29.3 47.7 010
45 2000-12-01 NA 45 19.7 38.3 001
46 2000-12-01 NA 46 19.7 38.3 001
47 2000-12-03 NA 47 21.3 39.7 001
48 2000-12-04 NA 48 22.3 40.7 001
49 2000-12-05 NA 49 23.3 41.7 001
50 2000-12-06 NA 50 24.3 42.7 001
51 2000-12-07 NA 51 25.3 43.7 001
52 2000-12-08 NA 52 26.3 44.7 001
53 2000-12-09 NA 53 27.3 45.7 001
54 2000-12-10 NA 54 28.3 46.7 001
55 2000-12-11 NA 55 29.3 47.7 001
56 2000-12-01 NA 56 19.7 38.3 001
57 2000-12-01 NA 57 19.7 38.3 001
58 2000-12-03 NA 58 21.3 39.7 001
59 2000-12-04 NA 59 22.3 40.7 001
60 2000-12-05 NA 60 23.3 41.7 001
61 2000-12-06 NA 61 24.3 42.7 001
62 2000-12-07 NA 62 25.3 43.7 001
63 2000-12-08 NA 63 26.3 44.7 001
64 2000-12-09 NA 64 27.3 45.7 001
65 2000-12-10 NA 65 28.3 46.7 001
66 2000-12-11 NA 66 29.3 47.7 001
更棘手的解决方案如下所示:
# Variant that encodes the three dummies in one "a|b|c" label per row, so the
# label can be split into separate columns afterwards.
library(dplyr)

# Per-date breakpoints: 1/3 and 2/3 quantiles of Leverage within each Date.
d1 <- "d1" # first breakpoint (column name)
test <- test[, (d1) := quantile(Leverage, 1 / 3), by = "Date"]
d2 <- "d2" # second breakpoint (column name)
test <- test[, (d2) := quantile(Leverage, 2 / 3), by = "Date"]

# Use '|' as the separator inside the label:
#   "1|0|0" when Leverage <= d1, "0|1|0" when d1 < Leverage <= d2,
#   "0|0|1" when Leverage > d2.
# The tercile index is computed for all rows at once by counting how many
# breakpoints Leverage exceeds; a row-by-row cut() with the row's own Leverage
# as the last break fails with "'breaks' are not unique" when Leverage equals
# d1 or d2. Inside mutate(), d1 / d2 refer to the breakpoint columns.
test <- test %>%
  as_tibble() %>%
  mutate(s = factor(
    1L + (Leverage > d1) + (Leverage > d2),
    levels = 1:3,
    labels = c("1|0|0", "0|1|0", "0|0|1")
  ))
> test
# A tibble: 66 x 6
Date gvkey Leverage d1 d2 dquant
<date> <chr> <int> <dbl> <dbl> <fct>
1 2000-12-01 gvkey1 1 19.7 38.3 1|0|0
2 2000-12-01 gvkey1 2 19.7 38.3 1|0|0
此后,我们必须将dquant列拆分为多个列。
# Split each "a|b|c" label in `test$s` at the literal '|' and stack the pieces
# into one row per observation; data.frame() names the columns X1, X2, X3.
label_parts <- strsplit(as.character(test$s), "|", fixed = TRUE)
dummy <- data.frame(do.call("rbind", label_parts), stringsAsFactors = FALSE)
# strsplit() returns text, so convert every column to integer 0/1; the
# dummies can then be used directly in arithmetic or as regressors.
# stringsAsFactors = FALSE above keeps as.integer() from returning factor
# codes on R versions before 4.0.
dummy[] <- lapply(dummy, as.integer)
> dummy
X1 X2 X3
1 1 0 0
2 1 0 0
3 1 0 0
4 1 0 0
5 1 0 0
6 1 0 0
....
最后,您得到了如下答案
# Append the columns of `dummy` to the right of the existing columns of `test`.
test <- cbind(test, dummy)
> test
Date gvkey Leverage d1 d2 dquant X1 X2 X3
1 2000-12-01 gvkey1 1 19.66667 38.33333 1|0|0 1 0 0
2 2000-12-01 gvkey1 2 19.66667 38.33333 1|0|0 1 0 0
3 2000-12-03 gvkey1 3 21.33333 39.66667 1|0|0 1 0 0
4 2000-12-04 gvkey1 4 22.33333 40.66667 1|0|0 1 0 0
5 2000-12-05 gvkey2 5 23.33333 41.66667 1|0|0 1 0 0
6 2000-12-06 gvkey2 6 24.33333 42.66667 1|0|0 1 0 0
7 2000-12-07 gvkey2 7 25.33333 43.66667 1|0|0 1 0 0
8 2000-12-08 gvkey2 8 26.33333 44.66667 1|0|0 1 0 0
9 2000-12-09 gvkey2 9 27.33333 45.66667 1|0|0 1 0 0
10 2000-12-10 gvkey3 10 28.33333 46.66667 1|0|0 1 0 0
11 2000-12-11 gvkey3 11 29.33333 47.66667 1|0|0 1 0 0
...