我有以下数据框https://www.dropbox.com/s/c02qu7uobvrc8ku/college_Rda
这是数据样本(可以直接复制粘贴):
# Sample data: ten respondents (PUBID 1-10) with one factor column per
# survey year (1997-2011) holding the college enrolment status recorded
# for that year. Every status column shares the same six factor levels.
educational_history <- local({
  status_levels <- c(
    "Not enrolled in college", "Enrolled in 2-year college",
    "Enrolled in 4-year college", "Enrolled in Graduate program",
    "VALID SKIP", "NON-INTERVIEW"
  )
  # Integer level codes, one vector per survey year, in chronological order.
  status_codes <- list(
    c(1L, 1L, 1L, 1L, 5L, 1L, 1L, 5L, 5L, 5L), # 1997
    c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), # 1998
    c(3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L), # 1999
    c(3L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L), # 2000
    c(3L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 3L, 1L), # 2001
    c(3L, 3L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L), # 2002
    c(1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), # 2003
    c(1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L), # 2004
    c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 3L), # 2005
    c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), # 2006
    c(1L, 1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), # 2007
    c(1L, 1L, 1L, 1L, 1L, 3L, 1L, 4L, 1L, 1L), # 2008
    c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L), # 2009
    c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 5L), # 2010
    c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L)  # 2011
  )
  # Turn each code vector into a factor carrying the shared levels.
  columns <- lapply(status_codes, function(codes) {
    structure(codes, .Label = status_levels, class = "factor")
  })
  names(columns) <- paste0("SCH_COLLEGE_STATUS_", 1997:2011, "_09")
  columns[["PUBID"]] <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
  # Same row.names encoding as the dput() output this was taken from.
  structure(columns, row.names = c(NA, 10L), class = "data.frame")
})
我想使用该数据生成新的数据框。
我只需要两个字段:PUBID,以及首次就读四年制大学的年份。年份信息包含在列名中。我试过了:
#' First survey year in which a respondent was enrolled in a 4-year college
#'
#' Scans every column named `SCH_COLLEGE_STATUS_<YYYY>_...` instead of a
#' hard-coded list of years, so all survey years present in `data` are
#' considered. The year is taken from the column name.
#'
#' @param ID   A single PUBID value to look up.
#' @param data Data frame with a `PUBID` column and the status columns.
#'   Defaults to the global `educational_history`, which keeps the original
#'   one-argument call working.
#' @return The earliest year (numeric) whose status equals
#'   "Enrolled in 4-year college", or `NA_real_` when the ID is not present
#'   or the respondent never has that status.
FirstYear4C <- function(ID, data = educational_history) {
  target <- "Enrolled in 4-year college"
  status_cols <- grep("^SCH_COLLEGE_STATUS_[0-9]{4}_", names(data),
                      value = TRUE)

  row_idx <- which(data$PUBID == ID)
  if (length(row_idx) == 0L) {
    # Unknown ID: report "no year" rather than failing inside `if`.
    return(NA_real_)
  }

  # Status labels of the (first) matching row, one per survey column.
  statuses <- vapply(
    data[row_idx[1L], status_cols, drop = FALSE],
    as.character,
    character(1)
  )
  hit <- which(statuses == target) # which() drops NA comparisons
  if (length(hit) == 0L) {
    return(NA_real_)
  }

  years <- as.numeric(
    sub("^SCH_COLLEGE_STATUS_([0-9]{4})_.*$", "\\1", status_cols)
  )
  # min() rather than "first hit" so column order does not matter.
  min(years[hit])
}
# Look up the first 4-year-college year for every respondent, then pair
# each PUBID with its result in a two-column data frame.
FirstYear <- unlist(
  lapply(educational_history[["PUBID"]], function(id) FirstYear4C(id))
)
FourYearCollege <- data.frame(
  PUBID = educational_history[["PUBID"]],
  FirstYear = FirstYear
)
我确信有更好的方法来编写这个函数。不得不逐列复制和粘贴,效率非常低。
期望的输出如下:
PUBID 1stYear4YC
1 1999
2 2000
...
6 2000
答案 0 :(得分:1)
library(data.table)
library(reshape2)
# Reshape to long format (one row per PUBID and status column), then, for
# each PUBID, take the first column whose status is
# "Enrolled in 4-year college" and keep only the year embedded in its name.
# Groups with no such row yield NA because `[1]` on an empty vector is NA.
data.table(melt(educational_history, id.var = "PUBID"))[
  ,
  .(first.year = sub(
    ".*_([0-9]+)_[0-9]+$",
    "\\1",
    variable[value == "Enrolled in 4-year college"][1]
  )),
  by = "PUBID"
]
# PUBID first.year
# 1: 1 1999
# 2: 2 2000
# 3: 3 NA
# 4: 4 NA
# 5: 5 NA
# 6: 6 2000
# 7: 7 NA
# 8: 8 1999
# 9: 9 2000
#10: 10 2002
分段运行以查看其工作原理。基本的想法是首先转换为长格式,然后很容易得到你想要的。
答案 1 :(得分:0)
假设行名(rownames)与PUBID一致(示例数据中即是如此),并且下面的 `df` 指的是上文的 `educational_history`:
# For every column except the 16th, build a matrix pairing the
# "YYYY_MM" part of the column name (characters 20-26) with the row
# indices whose status is "Enrolled in 4-year college". The row index is
# reported under the name PUBID. The column names are passed as a list
# (not a character vector) so the result stays unnamed.
Map(
  function(column_name) {
    survey_wave <- substr(column_name, 20, 26)
    enrolled_rows <- which(df[column_name] == "Enrolled in 4-year college")
    cbind(year = survey_wave, PUBID = enrolled_rows)
  },
  as.list(names(df)[-16])
)
[[1]]
year
[1,] "1997_09"
[[2]]
year
[1,] "1998_09"
[[3]]
year PUBID
[1,] "1999_09" "1"
[2,] "1999_09" "8"
[[4]]
year PUBID
[1,] "2000_09" "1"
[2,] "2000_09" "2"
[3,] "2000_09" "6"
[4,] "2000_09" "8"
[5,] "2000_09" "9"
[[5]]
year PUBID
[1,] "2001_09" "1"
[2,] "2001_09" "9"
[[6]]
year PUBID
[1,] "2002_09" "1"
[2,] "2002_09" "2"
[3,] "2002_09" "8"
[4,] "2002_09" "9"
[5,] "2002_09" "10"
[[7]]
year PUBID
[1,] "2003_09" "2"
[2,] "2003_09" "9"
[3,] "2003_09" "10"
[[8]]
year PUBID
[1,] "2004_09" "2"
[2,] "2004_09" "9"
[3,] "2004_09" "10"
[[9]]
year PUBID
[1,] "2005_09" "10"
[[10]]
year
[1,] "2006_09"
[[11]]
year PUBID
[1,] "2007_09" "6"
[[12]]
year PUBID
[1,] "2008_09" "6"
[[13]]
year
[1,] "2009_09"
[[14]]
year
[1,] "2010_09"
[[15]]
year
[1,] "2011_09"
答案 2 :(得分:0)
还有一个答案:
educational_history
require(stringr)
require(plyr)
## Long format: one row per PUBID and status column
eh <- melt(educational_history, id.var = "PUBID")
## Flag rows whose status label starts with "Enrolled in 4"
eh[["enrolled"]] <- str_detect(eh[["value"]], pattern = "^Enrolled in 4")
## Pull the "_YYYY_" token out of the column name, strip the underscores
## and convert it to a number
eh[["year"]] <- as.numeric(
  str_replace_all(
    str_extract(eh[["variable"]], pattern = "_[0-9]*_"),
    pattern = "_",
    replacement = ""
  )
)
## Keep only the flagged rows and take the earliest year per PUBID
ddply(
  eh[eh[["enrolled"]], ],
  .variables = .(PUBID),
  .fun = summarize,
  FirstYear4YC = min(year)
)
答案 3 :(得分:0)
#*************************************
# Function to determine the first year attended
#*************************************
getFirstYear <- function(n){
  # Returns the smallest column index that is TRUE in row `n` of the
  # global logical matrix `yearsEnrolled`, or NA when the row has no TRUE.
  hits <- which(yearsEnrolled[n, ])
  if (length(hits) >= 1) {
    return(min(hits))
  }
  NA
}
#*************************************
# Main Code
#*************************************
#Set up some constants
numRows <- nrow(educational_history)
# Match the exact 4-year status label. The bare pattern "Enrolled" also
# matched "Enrolled in 2-year college" and "Enrolled in Graduate program",
# which made the result the first year of *any* enrolment.
enrolled <- "Enrolled in 4-year college"
college <- "COLLEGE"
#Determine which columns have attendance data in them
semesters <- grep(college, names(educational_history))
# Logical matrix: one row per respondent, one column per status column.
# The column count follows the data instead of a hard-coded 15.
yearsEnrolled <- vapply(
  semesters,
  function(x) grepl(enrolled, educational_history[, x], fixed = TRUE),
  logical(numRows)
)
yearsEnrolled <- matrix(yearsEnrolled, nrow = numRows,
                        ncol = length(semesters))
#Identify the actual years associated with attendance
semesterHeaders <- names(educational_history)[semesters]
years <- sub("SCH_COLLEGE_STATUS_", "", semesterHeaders)
years <- as.numeric(sub("_09", "", years))
#Get IDs
PUBID <- educational_history$PUBID
#Get first year attended
# as.integer() keeps the index integer even when every row is NA; a
# logical NA index would be recycled over the whole `years` vector.
ndx <- as.integer(unlist(lapply(seq_len(numRows), getFirstYear)))
FirstYear4C <- years[ndx]
#Combine data into dataframe
firstYearData <- cbind.data.frame(PUBID, FirstYear4C)
#Complete cases
firstYearData <- firstYearData[complete.cases(firstYearData), ]
# seq_len() stays empty when no respondent qualifies (seq(0) is c(1, 0)).
row.names(firstYearData) <- seq_len(nrow(firstYearData))