R:将多个年份虚拟变量转换为单个因子变量

时间:2015-12-02 19:01:10

标签: r

我在 R 中的数据集包含表示年份的多个虚拟变量(year1、year2、year3 等)。我如何将这些虚拟变量转换成一个取值为 1995、1996、1997 等的“年份”变量?

在Stata我会做这样的事情:

* create year with the value 0 for every observation
gen year=0 
* overwrite year with the calendar year wherever the matching dummy equals 1;
* observations where neither dummy is 1 keep the initial 0
replace year=1995 if year1==1
replace year=1996 if year2==1

dput数据:

structure(list(wkd_ind = c(123L, 140L, 177L, 127L, 285L, 227L, 
333L, 135L, 124L, 395L, 104L, 362L, 204L, 309L, 510L, 154L, 276L, 
409L, 262L, 168L), assaults = c(2661L, 2845L, 3361L, 2490L, 5493L, 
4213L, 6579L, 2653L, 2849L, 6944L, 1650L, 5312L, 2917L, 4414L, 
7593L, 2041L, 5470L, 5531L, 4651L, 3159L), attend_v = c(0.74936, 
2.2334, 0.075539, 5.4919, 5.1195, 0.29706, 0.43023, 6.7021, 0.82108, 
0.49968, 3.0424, 0.15407, 2.0871, 0.081484, 0.7144, 9.9863, 3.7653, 
1.2931, 0.64987, 0.1372), attend_m = c(7.523, 6.4573, 14.575, 
5.2794, 7.5652, 10.649, 8.5319, 6.5313, 6.1471, 5.7738, 3.3895, 
3.42, 7.5825, 6.0173, 2.7251, 2.8784, 1.7649, 9.5522, 10.834, 
12.922), attend_n = c(5.5719, 2.5885, 8.3358, 4.2664, 6.3695, 
6.4263, 9.0384, 9.6412, 4.7777, 19.82, 20.971, 11.688, 18.561, 
10.305, 13.957, 4.942, 9.9064, 9.3939, 7.1644, 5.7901), h_chris = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), h_newyr = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), h_easter = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), h_july4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), h_mem = c(0L, 0L, 
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L), h_labor = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), w_maxa = c(0.16587, 
0.81338, 0.11745, 0.03471, 0.58038, 0.50356, 0.45934, 0.82159, 
0.52968, 0.21778, 0, 0, 0.094779, 0, 0.13667, 0, 0.1637, 0, 0, 
0), w_maxb = c(0, 0.00823, 0.31271, 0, 0.24928, 0, 0.12819, 0.12525, 
0.0092631, 0.67078, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), w_maxc = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0.0041149, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0), w_mina = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15232, 0.0014491, 
0, 0.00030794, 0, 8.93e-05, 0, 0.00062132, 0.00078076, 0), w_minb = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.038394, 0.036855, 0.22352, 0, 
0.32117, 0.0020882, 0.13658, 0.13159, 0.056588), w_minc = c(0.093716, 
0, 0, 0, 0, 0.041004, 0.018065, 0, 0.059047, 0, 0.12112, 0.13575, 
0.033517, 0.59676, 0, 0.3957, 0.073306, 0.46488, 0.56685, 0.31562
), w_rain = c(0.2167, 0.17555, 0.29204, 0.38594, 0.66403, 0.24707, 
0.36952, 0.33298, 0.25875, 0.28135, 0.58494, 0.71564, 0.033189, 
0.24098, 0.14998, 0.19021, 0.52752, 0.18456, 0.4079, 0.17756), 
    w_snow = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0056599, 0, 
    0.034913, 0, 0.43373, 0, 0.048099, 0.02458, 0.044347), year1 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), year3 = c(1L, 
    1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    0L, 0L, 0L, 0L), year4 = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), year5 = c(0L, 
    0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year6 = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L), year7 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), year9 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), year10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), month1 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 1L, 0L), month2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month3 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 1L), month4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), month5 = c(1L, 
    0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), month6 = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month7 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), month8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month9 = c(0L, 
    1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L), month10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), month11 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 
    0L, 1L, 0L, 0L), month12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L), year = c(1995L, 
    1997L, 1996L, 1999L, 1998L, 2005L, 1999L, 1995L, 2005L, 1996L, 
    1996L, 2001L, 2002L, 1998L, 2002L, 2005L, 2004L, 2004L, 1996L, 
    2002L)), .Names = c("wkd_ind", "assaults", "attend_v", "attend_m", 
"attend_n", "h_chris", "h_newyr", "h_easter", "h_july4", "h_mem", 
"h_labor", "w_maxa", "w_maxb", "w_maxc", "w_mina", "w_minb", 
"w_minc", "w_rain", "w_snow", "year1", "year2", "year3", "year4", 
"year5", "year6", "year7", "year8", "year9", "year10", "month1", 
"month2", "month3", "month4", "month5", "month6", "month7", "month8", 
"month9", "month10", "month11", "month12", "year"), datalabel = "", time.stamp = "13 Nov 2015 17:05", formats = c("%8.0g", 
"%8.0g", "%12.0g", "%12.0g", "%12.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%12.0g", "%12.0g", "%12.0g", "%12.0g", 
"%12.0g", "%12.0g", "%12.0g", "%12.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", 
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g"), types = c(65529L, 
65529L, 65526L, 65526L, 65526L, 65530L, 65530L, 65530L, 65530L, 
65530L, 65530L, 65526L, 65526L, 65526L, 65526L, 65526L, 65526L, 
65526L, 65526L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 
65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 
65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L
), val.labels = c("", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", ""), var.labels = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", ""), version = 117L, label.table = list(), expansion.fields = list(), strl = structure(character(0), .Names = character(0)), byteorder = "LSF", row.names = c(122L, 
139L, 176L, 126L, 284L, 226L, 332L, 134L, 123L, 394L, 103L, 361L, 
203L, 308L, 506L, 153L, 275L, 408L, 261L, 167L), class = "data.frame")

4 个答案:

答案 0 :(得分:2)

这是使用 dplyr 和 tidyr 的一种方式:
library(dplyr)
library(tidyr)

# Collapse the one-hot year1, year2, ... dummy columns of `d` into a single
# calendar `year` column (year1 -> 1995, year2 -> 1996, ...).
# Rows in which no year dummy equals 1 are dropped by the filter step.
d %>%
    # turn the year dummy columns into rows; the anchored pattern only matches
    # "year" followed by digits, so a plain `year` column is left alone
    pivot_longer(
        cols = matches("^year[0-9]+$"),
        names_to = "year_index",
        values_to = "dummy"
    ) %>%
    # remove extraneous rows created by the reshape (dummies that are not set)
    filter(dummy == 1) %>%
    # extract the year index and add it to a base year
    mutate(year = as.integer(sub("^year", "", year_index)) + 1994L) %>%
    # remove the helper columns
    select(-dummy, -year_index)

答案 1 :(得分:1)

以下是用 base R 完成该任务的一些代码。代码中的 df 就是您的数据框。

# build a small demo data frame holding two mutually exclusive year dummies
set.seed(10)
year1 <- round(runif(n = 10, min = 0, max = 1))
year2 <- 1 - year1
df <- data.frame(year1 = year1, year2 = year2)

# recode the dummies into one year column; rows matching no dummy stay NA
df$year <- NA
df$year[df$year1 %in% 1] <- 1995
df$year[df$year2 %in% 1] <- 1996 # etc

如果你有很多虚拟变量,并且虚拟变量的编号与年份之间没有规整的对应关系(顺序被打乱,或者编号中跳过了某些年份),那么可以使用如下所示的循环。只要把 years 和 year_names 这两个变量定义为相互对应的取值(年份和虚拟变量名),它就相当灵活。当虚拟变量和年份之间存在明确的关系时,使用避免循环的公式可能会更高效。

# names of year dummy variables
year_names <- c("year1", "year2")

# years corresponding to the year1, year2, ... columns (same order as
# year_names; the values do not have to be consecutive)
years <- c(1995, 1996)

# the two vectors are matched by position, so they must have the same length
stopifnot(length(year_names) == length(years))

# initializing column of dataframe
df$year <- NA

# looping over the year dummy variables; seq_along() yields an empty
# sequence (instead of c(1, 0)) when year_names is empty
for (i in seq_along(year_names)) {
  # [[ ]] extracts the column as a plain vector, and %in% never returns NA,
  # so a missing dummy value simply counts as "not set"
  is_set <- df[[year_names[i]]] %in% 1
  df$year[is_set] <- years[i]
}

答案 2 :(得分:1)

(df <- data.frame(y = 1:4, d1 = c(1,0,0,1), d2 = c(0,1,0,0), d3 = c(0,0,1,0))) 
#   y d1 d2 d3
# 1 1  1  0  0
# 2 2  0  1  0
# 3 3  0  0  1
# 4 4  1  0  0
cols <- 2:4 # or c("d1", "d2", "d3") - Dummy variable columns
nval <- 1999:2001 # New corresponding values

# Collapse one-hot dummy columns into a single numeric vector.
#   data   - data frame holding the dummy columns
#   cols   - positions or names of the dummy columns
#   values - value to use for each dummy column, in the same order as cols
# Returns one value per row of `data`. Rows in which exactly one dummy is set
# get the matching entry of `values`; rows with no dummy set, several dummies
# set, or only missing dummies get NA, so the result always has nrow(data)
# elements and can be assigned as a column.
dummies_to_value <- function(data, cols, values) {
  dummies <- as.matrix(data[, cols, drop = FALSE])
  stopifnot(ncol(dummies) == length(values))
  # treat a missing dummy as "not set"
  is_set <- !is.na(dummies) & dummies != 0
  out <- rep(NA_real_, nrow(is_set))
  one_hot <- rowSums(is_set) == 1
  # with exactly one TRUE per row, the matrix product picks that row's value
  out[one_hot] <- as.vector((is_set[one_hot, , drop = FALSE] * 1) %*% values)
  out
}

df$year <- dummies_to_value(df, cols, nval)
df
#   y d1 d2 d3 year
# 1 1  1  0  0 1999
# 2 2  0  1  0 2000
# 3 3  0  0  1 2001
# 4 4  1  0  0 1999

答案 3 :(得分:0)

这个怎么样?

year_cols <- paste0("year", 1:10)
# For each row, find the position of the single dummy that equals 1.
# A row with no dummy set (or more than one) yields NA; without this guard
# apply() would return a list for such data and the addition below would fail.
year_index <- apply(my_data[, year_cols], 1, function(x) {
  hit <- which(x == 1)
  if (length(hit) == 1L) hit else NA_integer_
})
# index 1 -> 1995, index 2 -> 1996, ...
my_data$year <- 1994 + year_index

我们先取出包含年份虚拟变量的那些列,然后使用 apply 函数表示:“对于每一行,告诉我哪一列等于 1”。这会给出一个索引,再加上 1994 即可:索引 1 对应 1995,索引 2 对应 1996,依此类推。

(我先把上面 dput 输出的内容赋值给了 my_data)