以上是两个数据文件的链接。下面是我的R脚本。简而言之,R脚本打开这些文件,删除几个不必要的变量,创建其他变量,并且通常使数据集更接近我可以使用的东西。我的问题是,虽然它正在处理Washtenaw文件和其他14个类似文件,但它不适用于西南咨询文件。
问题出现在代码末尾,包含变量rec.cols,rec.colnames和fas。 当我尝试使用rec.colnames作为名称时,我得到的错误是names属性包含的项目多于向量。 我理解这个问题的概念:rec.colnames有大约70个项目,rec.cols应该是0列和1行。但是,情况并非如此。我没有任何其他文件的这个问题。我想有人帮我弄清楚为什么会这样,以及我能做些什么来解决它。
library(data.table)
library(magrittr)
library(dplyr)
library(stringr)
library(tidyr)
# even after the item-level data is deleted from each data set, when all of the data sets are merged, the merged file is still too large to manipulate efficiently
# however, only the data from the initial and last assessments within the year of interest are needed for the reports
# the function below only deletes assessment data that are not from the initial or last assessments
del.mid <- function(file){
lower <- str_match(pattern = "cafas|pecfas", string = file)
upper <- toupper(lower)
fas <- fread(input = paste0("data/", lower, "/items-del/", file), sep = ",", header = TRUE, na.strings = "", stringsAsFactors = FALSE, colClasses = "character", fill = TRUE, data.table = FALSE)
#
# some cases have assessments entered that do not have any scores
# remove all data for these assessments so that they are not used as the initial most recent assessment
#
fas.long <- fas %>%
gather(key = key, value = value, starts_with(paste0(upper, "_E")), na.rm = FALSE) %>%
separate(col = key, into = c("scale", "time", "var"), sep = "_", extra = "merge") %>%
spread(key = var, value = value)
other.vars <- c("episodeNum", "clientAge", "assessID", "VersionNum", "SA_SoftwareID", "SA_ID", "Prog_SoftwareID", "Prog_ID", "assessDate", "daysSinceEpisodeStart", "status", "isLocked", "isDeleted", "isAmendment", "adminDesc", "adminMonths", "adminOtherDesc", "timePeriod", "noSubsequent", "language", "youthLivArrange", "youthLivArrangeIsOther", "youthOtherLivArrange", "assessorSoftwareID", "Primary_CaregiverID", "Primary_CaregiverRelation")
# score variables for the CAFAS and PECFAS are different
if(lower == "cafas") score.vars <- c("TotalScore", "TotalStrengths", "Tier", "ChildMgmtSkills", "PervasBhImpair", "RiskPsychotic", "RiskSevereSubstance", "RiskSuicideAttempt", "RiskSuicideIdeation", "RiskAggresive", "RiskSexual", "RiskFire", "RiskRunaway", "RiskExceedPrimaryCG", "RiskExceedNonCust", "RiskExceedSurrogate", "TotalSevere", "TotalScoreDiff", "MeaningfulReliableDiff", "SevereDiff", "PervasiveImpairDiff", "IndicatorImprove", "SchoolScore", "SchoolStrengthTotal", "HomeScore", "HomeStrengthTotal", "CommunityScore", "CommunityStrengthTotal", "BehaviorScore", "BehaviorStrengthTotal", "MoodScore", "MoodStrengthTotal", "SelfHarmScore", "SelfHarmStrengthTotal", "SubstanceScore", "SubstanceStrengthTotal", "ThinkingScore", "ThinkingStrengthTotal", "CG_P_Needs", "CG_P_Support", "Primary_CG_StrengthTotal", "NonCust_CaregiverID", "NonCust_CaregiverRelation", "NC_CG_Needs", "NC_CG_Support", "NC_CG_StrengthTotal", "Surr_CaregiverID", "Surr_CaregiverRelation", "Surr_CG_Needs", "Surr_CG_Support", "Surr_CG_StrengthTotal")
if(lower == "pecfas") score.vars <- c("TotalScore", "TotalStrengths", "Tier", "ChildMgmtSkills", "PervasBhImpair", "RiskThinking", "RiskSuicideAttempt", "RiskSuicideIdeation", "RiskAggresive", "RiskSexual", "RiskFire", "RiskRunaway", "RiskDelinquent", "RiskExceedPrimaryCG", "RiskExceedNonCust", "RiskExceedSurrogate", "TotalSevere", "TotalScoreDiff", "MeaningfulReliableDiff", "SevereDiff", "PervasiveImpairDiff", "IndicatorImprove", "SchoolScore", "SchoolStrengthTotal", "HomeScore", "HomeStrengthTotal", "CommunityScore", "CommunityStrengthTotal", "BehaviorScore", "BehaviorStrengthTotal", "MoodScore", "MoodStrengthTotal", "SelfHarmScore", "SelfHarmStrengthTotal", "ThinkingScore", "ThinkingStrengthTotal", "CG_P_Needs", "CG_P_Support", "Primary_CG_StrengthTotal", "NonCust_CaregiverID", "NonCust_CaregiverRelation", "NC_CG_Needs", "NC_CG_Support", "NC_CG_StrengthTotal", "Surr_CaregiverID", "Surr_CaregiverRelation", "Surr_CG_Needs", "Surr_CG_Support", "Surr_CG_StrengthTotal")
fas.long[fas.long[, score.vars] %>% is.na %>% rowSums == length(score.vars), other.vars] <- NA
fas.wide <- fas.long %>%
unite(col = scaletime, c(scale, time), sep = "_") %>%
gather(key = var, value = value, adminDesc:youthOtherLivArrange) %>%
unite(col = key, c(scaletime, var), sep = "_") %>%
spread(key = key, value = value, fill = NA)
fas <- fas.wide[, names(fas)]
rm(fas.long, fas.wide, other.vars)
# only interested in cases with assessments within the year of interest for report
# year of interest for report: 10/1/2013 to 9/30/2016
start.date <- as.Date("2009-10-1", format = "%Y-%m-%d")
end.date <- as.Date("2016-9-30", format = "%Y-%m-%d")
# convert date variables to date type
fas[, grep(pattern = "Date", x = names(fas))] <- lapply(X = fas[, grep(pattern = "Date", x = names(fas))], FUN = as.Date, format = "%m/%d/%Y")
#
# for cases with revised initial assessments, replace initial data with revised initial data
#
# create variables that indicate whether a revised assessment is present or absent
# some cases have multiple initial assessments, so check all of them for revisions
# variable "max.e" reflects the maximum number of initial assessments present in the data sets
seq.e <- grep(pattern = "E[1-9]+TInitial_assessDate", x = names(fas), value = TRUE) %>% regmatches(., gregexpr("[0-9]+", .)) %>% unlist
max.e <- seq.e %>% max %>% as.numeric
fas[, sapply(X = seq.e, FUN = function(x) paste0("e", x, "tr.initial"))] <- FALSE
# function to replace initial assessment data with revised data
for(num in seq.e){
datecol <- paste0(upper, "_E", num, "TRInitial_assessDate")
logcol <- paste0("e", num, "tr.initial")
if(length(grep(pattern = paste0("E", num, "TRInitial"), names(fas))) > 0) fas[(fas[ , datecol] >= start.date) & (fas[, datecol] <= end.date) & (!is.na(fas[ , datecol])), logcol] <- TRUE
fas[fas[, logcol] == TRUE, grep(pattern = paste0("E", num, "TInitial"), x = names(fas))] <- fas[fas[, logcol] == TRUE, grep(pattern = paste0("E", num, "TRInitial"), x = names(fas))]
}
rm(num, datecol, logcol)
# delete revised assessment data columns because they are redundant
# prevents them from being considered as a recent assessment
if(length(grep(pattern = paste0("E[1-", max.e, "]TRInitial"), names(fas))) > 0)
fas <- fas[, -grep(pattern = paste0("E[1-", max.e, "]TRInitial"), names(fas))]
#
# create variables to indicate the presence or absence of an initial assessment
#
fas[, sapply(X = seq.e, FUN = function(x) paste0("e", x, "t.initial"))] <- FALSE
for(num in seq.e){
datecol <- paste0(upper, "_E", num, "TInitial_assessDate")
fas[fas[, datecol] >= start.date & fas[, datecol] <= end.date & !is.na(fas[, datecol]), paste0("e", num, "t.initial")] <- TRUE
fas[rowSums(is.na(fas[paste0(upper, "_E", num, "TInitial_", score.vars)])) == length(score.vars), paste0("e", num, "t.initial")] <- FALSE # do not consider an assessment as present if all the scores are missing
}
rm(num, datecol)
# some cases have multiple initial assessments within the year of interest
# use the earliest one
# create a variable that indicates which initial assessment is used
fas$et.initial <- 0
for(num in rev(seq.e)){
fas$et.initial[fas[, paste0("e", num, "t.initial")] == TRUE] <- as.numeric(num)
}
rm(num)
#
# create a set of initial assessment variables that applies to all children
#
# delete cases without an initial assessment
fas <- fas[fas$et.initial != 0, ]
init.cols <- fas %>% select(matches(paste0(upper, "_E1TInitial")))
init.colnames <- grep(pattern = paste0(upper, "_E1TInitial"), x = names(fas), value = TRUE)
init.colnames <- paste0(upper, "_", gsub(pattern = paste0(upper, "_E1T"), replacement = "", x = init.colnames))
names(init.cols) <- init.colnames
fas <- cbind(fas, init.cols)
rm(init.cols, init.colnames)
for(num in seq.e[2:length(seq.e)]){
fas[fas$et.initial == as.numeric(num), grep(pattern = paste0(upper, "_Initial"), x = names(fas))] <- fas[fas$et.initial == as.numeric(num), grep(pattern = paste0(upper, "_E", num, "TInitial"), x = names(fas))]
}
#
# create a set of most recent assessment variables that applies to all children
#
# create variables that indicate number of days that have passed since the initial assessment for each assessment period
days <- lapply(X = fas[, grep(pattern = "assessDate", names(fas))], FUN = function(x) x - fas[, paste0(upper, "_Initial_assessDate")]) %>% unlist %>% matrix(nrow = nrow(fas), byrow = FALSE) %>% data.frame
names(days) <- grep(pattern = "assessDate", x = names(fas), value = TRUE) %>% sub(pattern = paste0(upper, "_"), replacement = "", x = .) %>% sub(pattern = "_assessDate", replacement = "", x = .) %>% paste(., "_days_pass", sep = "")
fas <- cbind(fas, days)
rm(days)
fas$Initial_days_pass <- NULL
# replace 0 or negative days passed with NA
fas[, grep(pattern = "days_pass", x = names(fas))][fas[, grep(pattern = "days_pass", names(fas))] <= 0] <- NA
# the code following the chunk below creates variables for the most recent assessment data
# it is easier to execute the code if cases with only an initial assessment are not in the data set
# add these cases back to the data set later
# some data sets do not have any cases with a most recent assessment
if((rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) %>% sum != nrow(fas)) {
init.only <- fas[which(rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])), ]
fas <- fas[-which(rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])), ]
# calculate the max days between the initial and most recent assessments
fas$rec_days <- apply(X = fas[grep(pattern = "days_pass", x = names(fas), value = TRUE)], MARGIN = 1, FUN = max, na.rm = TRUE)
# create a variable that indicates the time period of the most recent assessment
days.pass.cols <- fas[grep(pattern = "days_pass", x = names(fas), value = TRUE)]
fas$rec_time <- colnames(days.pass.cols)[apply(X = days.pass.cols, MARGIN = 1, FUN = which.max)] %>% sub(pattern = "_days_pass", replacement = "", x = .)
rm(days.pass.cols)
# create most recent assessment variables
rec.cols <- apply(X = fas, MARGIN = 1, FUN = function(x) x[grep(pattern = paste0(upper, "_Recent_"), names(x))] <- x[grep(pattern = paste(paste0(upper, "_"), x["rec_time"], sep = ""), names(x))]) %>% t %>% data.frame
rec.colnames <- grep(pattern = paste0(upper, "_Initial"), x = names(fas), value = TRUE) %>% gsub(pattern = "Initial", replacement = "Recent", x = .)
names(rec.cols) <- rec.colnames
fas <- cbind(fas, rec.cols)
rm(rec.cols)
}
#
# limit data set to background variables, initial, and most recent assessments
# add cases only with initial assessment back to data set
#
fas <- fas[, c(1:which(colnames(fas) == "E1_endDate"), which(colnames(fas) == "e1tr.initial"):ncol(fas))]
# some data sets do not have any cases with a most recent assessment
if((rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) %>% sum != nrow(fas)) {
init.only <- init.only[, c(1:which(colnames(init.only) == "E1_endDate"), which(colnames(init.only) == "e1tr.initial"):ncol(init.only))]
fas <- bind_rows(fas, init.only)
rm(init.only)
}
# add filename to data sets
fas$filename <- file
write.csv(fas, paste0("data/", lower, "/bref/", gsub(pattern = "\\.", replacement = "_bref.", file)), row.names = FALSE)
}
# delete assessment data that are not from the initial or last assessments from all cafas data sets
cafas.files <- list.files("data/cafas/items-del", pattern = ".csv")
for (file in cafas.files){
del.mid(file)
}
# delete assessment data that are not from the initial or last assessments from all pecfas data sets
pecfas.files <- list.files("data/pecfas/items-del", pattern = ".csv")
for (file in pecfas.files){
del.mid(file)
}