我在 R 中的数据集包含表示年份的多个虚拟变量(year1、year2、year3 等)。如何将这些虚拟变量转换成一个取值为 1995、1996、1997 等的“year”变量?
在 Stata 中我会这样做:
* Start every observation at 0, then overwrite it with the calendar year
* of whichever dummy is set (year1 -> 1995, year2 -> 1996).
generate year = 0
forvalues i = 1/2 {
    replace year = 1994 + `i' if year`i' == 1
}
以下是 dput 输出的数据:
structure(list(wkd_ind = c(123L, 140L, 177L, 127L, 285L, 227L,
333L, 135L, 124L, 395L, 104L, 362L, 204L, 309L, 510L, 154L, 276L,
409L, 262L, 168L), assaults = c(2661L, 2845L, 3361L, 2490L, 5493L,
4213L, 6579L, 2653L, 2849L, 6944L, 1650L, 5312L, 2917L, 4414L,
7593L, 2041L, 5470L, 5531L, 4651L, 3159L), attend_v = c(0.74936,
2.2334, 0.075539, 5.4919, 5.1195, 0.29706, 0.43023, 6.7021, 0.82108,
0.49968, 3.0424, 0.15407, 2.0871, 0.081484, 0.7144, 9.9863, 3.7653,
1.2931, 0.64987, 0.1372), attend_m = c(7.523, 6.4573, 14.575,
5.2794, 7.5652, 10.649, 8.5319, 6.5313, 6.1471, 5.7738, 3.3895,
3.42, 7.5825, 6.0173, 2.7251, 2.8784, 1.7649, 9.5522, 10.834,
12.922), attend_n = c(5.5719, 2.5885, 8.3358, 4.2664, 6.3695,
6.4263, 9.0384, 9.6412, 4.7777, 19.82, 20.971, 11.688, 18.561,
10.305, 13.957, 4.942, 9.9064, 9.3939, 7.1644, 5.7901), h_chris = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), h_newyr = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), h_easter = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), h_july4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), h_mem = c(0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), h_labor = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), w_maxa = c(0.16587,
0.81338, 0.11745, 0.03471, 0.58038, 0.50356, 0.45934, 0.82159,
0.52968, 0.21778, 0, 0, 0.094779, 0, 0.13667, 0, 0.1637, 0, 0,
0), w_maxb = c(0, 0.00823, 0.31271, 0, 0.24928, 0, 0.12819, 0.12525,
0.0092631, 0.67078, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), w_maxc = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0.0041149, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0), w_mina = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15232, 0.0014491,
0, 0.00030794, 0, 8.93e-05, 0, 0.00062132, 0.00078076, 0), w_minb = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.038394, 0.036855, 0.22352, 0,
0.32117, 0.0020882, 0.13658, 0.13159, 0.056588), w_minc = c(0.093716,
0, 0, 0, 0, 0.041004, 0.018065, 0, 0.059047, 0, 0.12112, 0.13575,
0.033517, 0.59676, 0, 0.3957, 0.073306, 0.46488, 0.56685, 0.31562
), w_rain = c(0.2167, 0.17555, 0.29204, 0.38594, 0.66403, 0.24707,
0.36952, 0.33298, 0.25875, 0.28135, 0.58494, 0.71564, 0.033189,
0.24098, 0.14998, 0.19021, 0.52752, 0.18456, 0.4079, 0.17756),
w_snow = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0056599, 0,
0.034913, 0, 0.43373, 0, 0.048099, 0.02458, 0.044347), year1 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), year2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), year3 = c(1L,
1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L), year4 = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), year5 = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), year6 = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L), year7 = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), year8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), year9 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), year10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), month1 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L), month2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month3 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L), month4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), month5 = c(1L,
0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), month6 = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month7 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), month8 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), month9 = c(0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), month10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L), month11 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 1L, 0L, 0L), month12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L), year = c(1995L,
1997L, 1996L, 1999L, 1998L, 2005L, 1999L, 1995L, 2005L, 1996L,
1996L, 2001L, 2002L, 1998L, 2002L, 2005L, 2004L, 2004L, 1996L,
2002L)), .Names = c("wkd_ind", "assaults", "attend_v", "attend_m",
"attend_n", "h_chris", "h_newyr", "h_easter", "h_july4", "h_mem",
"h_labor", "w_maxa", "w_maxb", "w_maxc", "w_mina", "w_minb",
"w_minc", "w_rain", "w_snow", "year1", "year2", "year3", "year4",
"year5", "year6", "year7", "year8", "year9", "year10", "month1",
"month2", "month3", "month4", "month5", "month6", "month7", "month8",
"month9", "month10", "month11", "month12", "year"), datalabel = "", time.stamp = "13 Nov 2015 17:05", formats = c("%8.0g",
"%8.0g", "%12.0g", "%12.0g", "%12.0g", "%8.0g", "%8.0g", "%8.0g",
"%8.0g", "%8.0g", "%8.0g", "%12.0g", "%12.0g", "%12.0g", "%12.0g",
"%12.0g", "%12.0g", "%12.0g", "%12.0g", "%8.0g", "%8.0g", "%8.0g",
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g",
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g",
"%8.0g", "%8.0g", "%8.0g", "%8.0g", "%8.0g"), types = c(65529L,
65529L, 65526L, 65526L, 65526L, 65530L, 65530L, 65530L, 65530L,
65530L, 65530L, 65526L, 65526L, 65526L, 65526L, 65526L, 65526L,
65526L, 65526L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L,
65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L,
65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L, 65530L
), val.labels = c("", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", ""), var.labels = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", ""), version = 117L, label.table = list(), expansion.fields = list(), strl = structure(character(0), .Names = character(0)), byteorder = "LSF", row.names = c(122L,
139L, 176L, 126L, 284L, 226L, 332L, 134L, 123L, 394L, 103L, 361L,
203L, 308L, 506L, 153L, 275L, 408L, 261L, 167L), class = "data.frame")
答案 0 :(得分:2)
这是使用 dplyr 和 tidyr 的一种方法:
library(dplyr)
library(tidyr)
# Collapse the year1, year2, ... dummy columns of `d` into one numeric `year`
# column. The pipeline also relies on dplyr being attached (%>%, filter,
# mutate, select, starts_with).
d %>%
  # reshape to long form: one row per (observation, year column) pair; the
  # column name goes into `year`, the cell value into `dummy`
  gather('year', 'dummy', starts_with('year')) %>%
  # keep only the row whose dummy is set; this also drops observations
  # for which no dummy equals 1
  filter(dummy == 1) %>%
  # strip the non-digit characters from the column name ("year3" -> 3) and add
  # the base year; base R is used because tidyr::extract_numeric() is deprecated
  mutate(year = as.numeric(gsub('[^0-9]', '', year)) + 1994) %>%
  # the helper column is no longer needed
  select(-dummy)
答案 1 :(得分:1)
以下是完成该任务的一些基础 R 代码。代码中的 df 即为您的数据框。
# Build a toy data frame holding two mutually exclusive year dummies.
set.seed(10)
year1 <- round(runif(10, min = 0, max = 1))
year2 <- 1 - year1
df <- data.frame(year1 = year1, year2 = year2)
# Collapse the dummies into a single year column; a row matching neither
# dummy would keep the initial NA.
df$year <- NA
df$year[df$year1 %in% 1] <- 1995
df$year[df$year2 %in% 1] <- 1996  # etc
如果你有很多虚拟变量,并且虚拟变量与年份之间没有整齐的对应关系(例如顺序被打乱,或者编号中跳过了某些年份),那么可以使用下面这样的循环。只要把 years 和 year_names 这两个变量定义为对应的参照值(年份和虚拟变量名),它就相当灵活。当虚拟变量和年份之间存在明确的关系时,使用不需要循环的公式可能会更高效。
# names of the year dummy columns
year_names <- c('year1', 'year2')
# calendar year encoded by each dummy, in the same order as year_names
years <- seq(1995, 1996, 1)
# the two lookup vectors must pair up one-to-one
stopifnot(length(years) == length(year_names))
# start from NA so rows matching no dummy stay missing
df$year <- NA
# seq_along() iterates zero times when year_names is empty, whereas
# 1:length(year_names) would iterate over c(1, 0)
for (i in seq_along(year_names)) {
  # [[ ]] extracts the column as a plain vector; %in% never yields NA,
  # so missing dummy values are simply not matched
  df$year[df[[year_names[i]]] %in% 1] <- years[i]
}
答案 2 :(得分:1)
# Example data: y plus three mutually exclusive dummy columns d1..d3.
(df <- data.frame(y = 1:4, d1 = c(1,0,0,1), d2 = c(0,1,0,0), d3 = c(0,0,1,0)))
#   y d1 d2 d3
# 1 1  1  0  0
# 2 2  0  1  0
# 3 3  0  0  1
# 4 4  1  0  0
cols <- 2:4 # or c("d1", "d2", "d3") - Dummy variable columns
nval <- 1999:2001 # New corresponding values
# Multiply each dummy by its year and sum across the row: with exactly one
# dummy set, the row sum is that dummy's year.
dummies <- as.matrix(df[, cols])
df$year <- as.vector(dummies %*% nval)
# Rows with no dummy (or more than one) have no well-defined year; mark them
# NA instead of failing or silently shifting values onto the wrong rows.
df$year[rowSums(dummies != 0) != 1] <- NA
df
#   y d1 d2 d3 year
# 1 1  1  0  0 1999
# 2 2  0  1  0 2000
# 3 3  0  0  1 2001
# 4 4  1  0  0 1999
答案 3 :(得分:0)
这个怎么样?
# names of the ten year dummy columns: year1 ... year10
year_cols <- paste0('year', 1:10)
# For each row, find the position of the first dummy equal to 1. match()
# always returns exactly one value (NA when no dummy is set), so apply()
# yields a plain vector; which() would return integer(0) for such a row and
# turn the result into a list that cannot be added to 1994.
year_idx <- apply(my_data[, year_cols], 1, function(x) match(1, x))
# index 1 -> 1995, index 2 -> 1996, ...
my_data$year <- 1994 + year_idx
我们先取出包含年份虚拟变量的那些列,然后用 apply 函数表示:“对于每一行,告诉我哪一列等于 1”。这样会得到一个索引,再加上 1994 即可:索引 1 对应 1995,索引 2 对应 1996,依此类推。
(我先把上面 dput 输出的内容赋值给了 my_data。)