在 R 中使用 quantile() 创建虚拟变量(dummy)

时间:2019-06-03 13:17:27

标签: r data.table

我正在尝试根据数据条目所在的分位数为其分配一个虚拟变量(dummy)。我有 3 个分位区间(1/3、2/3、3/3):如果 Leverage 落在 q1 中,则应在一个单独的列中记为 1;如果落在 q2 中,则在另一列中记为 1(其余列保持为 0)。

这是我的数据示例:

# Sample data. One cycle of 11 observations (4 x gvkey1, 5 x gvkey2,
# 2 x gvkey3) is repeated to build the firm identifiers.
# Note: k holds 4 cycles (44 values) while l and m hold 66 values; structure()
# does not validate column lengths, so the frame is declared with 66 rows.
k <- rep(rep(c("gvkey1", "gvkey2", "gvkey3"), times = c(4, 5, 2)), times = 4)

# Dates as month/day/year strings; "12/1/2000" occurs twice per cycle and
# there is no "12/2/2000".
l <- rep(
  c("12/1/2000", "12/1/2000", "12/3/2000", "12/4/2000", "12/5/2000",
    "12/6/2000", "12/7/2000", "12/8/2000", "12/9/2000", "12/10/2000",
    "12/11/2000"),
  times = 6
)
m <- 1:66

# Assemble the data.frame directly from its parts.
y <- structure(list(Date = l, gvkey = k, Leverage = m),
               row.names = c(NA, -66L), class = "data.frame")

# Parse the date strings into Date objects.
y$Date <- as.Date(y$Date, format = "%m/%d/%Y")

test <- data.table(y)

这是如上所述的代码:

# quantile function per date
# Adds two columns to `test`: the 1/3 and 2/3 quantiles of Leverage computed
# within each Date. d1 / d2 hold the column names as strings.
d1 <- paste("d1") # first breakpoint
test <- test[, (d1) := quantile(Leverage, (1/3)), by = "Date"]

d2 <- paste("d2") #second breakpoint
test <- test[, (d2) := quantile(Leverage, (2/3)), by = "Date"]

# match companies and quantiles
# NOTE(review): the conditions below compare d1 / d2 with quantiles of the
# whole test$Leverage column; the row's own Leverage never appears in them.
# Inside `[.data.table` the bare names d1 and d2 presumably resolve to the
# columns created above rather than to the strings "d1" / "d2" -- confirm.
dquant1 <- paste("dquant1")
test <- test[, (dquant1) := ifelse(d1 < quantile(test$Leverage, 1/3), 1, 0), by = "Date"]
# NOTE(review): d33_66 is not defined anywhere in the visible code, while
# dquant2 (assigned on the next line) is never used -- presumably (dquant2)
# was intended as the target column.
# NOTE(review): && is the scalar operator; a vector condition passed to
# ifelse() needs the elementwise &.
dquant2 <- paste("dquant2")
test <- test[, (d33_66) := ifelse((d1 > quantile(test$Leverage, 1/3) && (d2 < quantile(test$Leverage, 2/3))),1,0), by = "Date"]
dquant3 <- paste("dquant3")
test <- test[, (dquant3) := ifelse(d1 > quantile(test$Leverage, 2/3), 1, 0), by = "Date"]

我在原始数据集中遇到的问题是:有时同一行会在两列中同时得到虚拟变量 1(例如 1 0 1),也就是被分到了两个投资组合,这正是我想要解决的问题。而对于这个示例数据,有时某一行根本得不到任何虚拟变量。

欢迎任何建议! 谢谢 约翰内斯

1 个答案:

答案 0 :(得分:1)

这种方法怎么样?

# Label every row by the tercile its Leverage falls into, using the per-date
# breakpoints d1 and d2 that are already columns of `test`:
#   "100" = Leverage <= d1, "010" = d1 < Leverage <= d2, "001" = Leverage > d2.
# Vectorised comparisons replace rowwise() + cut(): under rowwise(),
# max(Leverage) is just the row's own value, and cut() stops with
# "'breaks' are not unique" whenever Leverage equals a breakpoint or
# d1 == d2; the fixed lower break of 0 also turned negative values into NA.
# as_tibble() keeps the result a tibble so print(n = Inf) shows all rows.
test %>%
  as_tibble() %>%
  mutate(dquant = factor(
    1L + (Leverage > d1) + (Leverage > d2),
    levels = 1:3,
    labels = c("100", "010", "001")
  )) %>%
  print(n = Inf)
# A tibble: 66 x 6
   Date       gvkey  Leverage    d1    d2 dquant     
   <date>     <chr>     <int> <dbl> <dbl> <fct>
 1 2000-12-01 gvkey1        1  19.7  38.3 100  
 2 2000-12-01 gvkey1        2  19.7  38.3 100  
 3 2000-12-03 gvkey1        3  21.3  39.7 100  
 4 2000-12-04 gvkey1        4  22.3  40.7 100  
 5 2000-12-05 gvkey2        5  23.3  41.7 100  
 6 2000-12-06 gvkey2        6  24.3  42.7 100  
 7 2000-12-07 gvkey2        7  25.3  43.7 100  
 8 2000-12-08 gvkey2        8  26.3  44.7 100  
 9 2000-12-09 gvkey2        9  27.3  45.7 100  
10 2000-12-10 gvkey3       10  28.3  46.7 100  
11 2000-12-11 gvkey3       11  29.3  47.7 100  
12 2000-12-01 gvkey1       12  19.7  38.3 100  
13 2000-12-01 gvkey1       13  19.7  38.3 100  
14 2000-12-03 gvkey1       14  21.3  39.7 100  
15 2000-12-04 gvkey1       15  22.3  40.7 100  
16 2000-12-05 gvkey2       16  23.3  41.7 100  
17 2000-12-06 gvkey2       17  24.3  42.7 100  
18 2000-12-07 gvkey2       18  25.3  43.7 100  
19 2000-12-08 gvkey2       19  26.3  44.7 100  
20 2000-12-09 gvkey2       20  27.3  45.7 100  
21 2000-12-10 gvkey3       21  28.3  46.7 100  
22 2000-12-11 gvkey3       22  29.3  47.7 100  
23 2000-12-01 gvkey1       23  19.7  38.3 010  
24 2000-12-01 gvkey1       24  19.7  38.3 010  
25 2000-12-03 gvkey1       25  21.3  39.7 010  
26 2000-12-04 gvkey1       26  22.3  40.7 010  
27 2000-12-05 gvkey2       27  23.3  41.7 010  
28 2000-12-06 gvkey2       28  24.3  42.7 010  
29 2000-12-07 gvkey2       29  25.3  43.7 010  
30 2000-12-08 gvkey2       30  26.3  44.7 010  
31 2000-12-09 gvkey2       31  27.3  45.7 010  
32 2000-12-10 gvkey3       32  28.3  46.7 010  
33 2000-12-11 gvkey3       33  29.3  47.7 010  
34 2000-12-01 gvkey1       34  19.7  38.3 010  
35 2000-12-01 gvkey1       35  19.7  38.3 010  
36 2000-12-03 gvkey1       36  21.3  39.7 010  
37 2000-12-04 gvkey1       37  22.3  40.7 010  
38 2000-12-05 gvkey2       38  23.3  41.7 010  
39 2000-12-06 gvkey2       39  24.3  42.7 010  
40 2000-12-07 gvkey2       40  25.3  43.7 010  
41 2000-12-08 gvkey2       41  26.3  44.7 010  
42 2000-12-09 gvkey2       42  27.3  45.7 010  
43 2000-12-10 gvkey3       43  28.3  46.7 010  
44 2000-12-11 gvkey3       44  29.3  47.7 010  
45 2000-12-01 NA           45  19.7  38.3 001  
46 2000-12-01 NA           46  19.7  38.3 001  
47 2000-12-03 NA           47  21.3  39.7 001  
48 2000-12-04 NA           48  22.3  40.7 001  
49 2000-12-05 NA           49  23.3  41.7 001  
50 2000-12-06 NA           50  24.3  42.7 001  
51 2000-12-07 NA           51  25.3  43.7 001  
52 2000-12-08 NA           52  26.3  44.7 001  
53 2000-12-09 NA           53  27.3  45.7 001  
54 2000-12-10 NA           54  28.3  46.7 001  
55 2000-12-11 NA           55  29.3  47.7 001  
56 2000-12-01 NA           56  19.7  38.3 001  
57 2000-12-01 NA           57  19.7  38.3 001  
58 2000-12-03 NA           58  21.3  39.7 001  
59 2000-12-04 NA           59  22.3  40.7 001  
60 2000-12-05 NA           60  23.3  41.7 001  
61 2000-12-06 NA           61  24.3  42.7 001  
62 2000-12-07 NA           62  25.3  43.7 001  
63 2000-12-08 NA           63  26.3  44.7 001  
64 2000-12-09 NA           64  27.3  45.7 001  
65 2000-12-10 NA           65  28.3  46.7 001  
66 2000-12-11 NA           66  29.3  47.7 001

更棘手的解决方案如下所示:

# Per-date breakpoints: columns d1 and d2 hold the 1/3 and 2/3 quantiles of
# Leverage within each Date.
d1 <- "d1" # name of the first breakpoint column
test <- test[, (d1) := quantile(Leverage, 1 / 3), by = "Date"]

d2 <- "d2" # name of the second breakpoint column
test <- test[, (d2) := quantile(Leverage, 2 / 3), by = "Date"]

# Encode the tercile of each row as a "|"-separated dummy pattern in column
# `s`, so it can be split into one column per tercile afterwards:
#   "1|0|0" = Leverage <= d1, "0|1|0" = d1 < Leverage <= d2,
#   "0|0|1" = Leverage > d2.
# Vectorised comparisons replace rowwise() + cut(): cut() stops with
# "'breaks' are not unique" whenever Leverage equals a breakpoint or
# d1 == d2, and rowwise() left `test` as a row-grouped tibble.
test <- test %>%
  as_tibble() %>%
  mutate(s = factor(
    1L + (Leverage > d1) + (Leverage > d2),
    levels = 1:3,
    labels = c("1|0|0", "0|1|0", "0|0|1")
  ))
> test
 # A tibble: 66 x 6
   Date       gvkey  Leverage    d1    d2 dquant
   <date>     <chr>     <int> <dbl> <dbl> <fct> 
 1 2000-12-01 gvkey1        1  19.7  38.3 1|0|0 
 2 2000-12-01 gvkey1        2  19.7  38.3 1|0|0 

此后,我们必须将上面创建的 s 列(在输出中显示为 dquant)拆分为多个列。

# Split the "|"-separated pattern in test$s into one column per tercile
# (X1, X2, X3). fixed = TRUE makes strsplit() treat "|" literally instead of
# as the regex alternation operator.
dummy <- data.frame(do.call("rbind",
                            strsplit(as.character(test$s), "|", fixed = TRUE)))
# Store the pieces as integers so the dummies are numeric 0/1 values rather
# than text; as.character() first also covers factor columns.
dummy[] <- lapply(dummy, function(column) as.integer(as.character(column)))
> dummy
   X1 X2 X3
1   1  0  0
2   1  0  0
3   1  0  0
4   1  0  0
5   1  0  0
6   1  0  0
....

最后,您得到了如下答案


# Append the dummy columns to the right of the existing columns of `test`.
test <- cbind(test, dummy)

> test
         Date  gvkey Leverage       d1       d2 dquant X1 X2 X3
1  2000-12-01 gvkey1        1 19.66667 38.33333  1|0|0  1  0  0
2  2000-12-01 gvkey1        2 19.66667 38.33333  1|0|0  1  0  0
3  2000-12-03 gvkey1        3 21.33333 39.66667  1|0|0  1  0  0
4  2000-12-04 gvkey1        4 22.33333 40.66667  1|0|0  1  0  0
5  2000-12-05 gvkey2        5 23.33333 41.66667  1|0|0  1  0  0
6  2000-12-06 gvkey2        6 24.33333 42.66667  1|0|0  1  0  0
7  2000-12-07 gvkey2        7 25.33333 43.66667  1|0|0  1  0  0
8  2000-12-08 gvkey2        8 26.33333 44.66667  1|0|0  1  0  0
9  2000-12-09 gvkey2        9 27.33333 45.66667  1|0|0  1  0  0
10 2000-12-10 gvkey3       10 28.33333 46.66667  1|0|0  1  0  0
11 2000-12-11 gvkey3       11 29.33333 47.66667  1|0|0  1  0  0
...