R脚本适用于一个数据文件,但不适用于同一类型的另一个

时间:2018-03-30 20:47:41

标签: r dplyr dataset data-cleaning

https://www.dropbox.com/s/yjvesdycs6n91ee/pecfas_100233%20%5Bmi-%20washtenaw%20community%20health%20orgnization%5D.csv?dl=0

https://www.dropbox.com/s/v41dpf64rhysqum/pecfas_100203%20%5Bmi-%20southwest%20counseling%20solutions%5D.csv?dl=0

以上是两个数据文件的链接。下面是我的R脚本。简而言之,R脚本打开这些文件,删除几个不必要的变量,创建其他变量,并且通常使数据集更接近我可以使用的东西。我的问题是,虽然它正在处理Washtenaw文件和其他14个类似文件,但它不适用于西南咨询文件。

问题出现在代码末尾,包含变量rec.cols,rec.colnames和fas。 当我尝试使用rec.colnames作为名称时,我得到的错误是names属性包含的项目多于向量。 我理解这个问题的概念:rec.colnames有大约70个项目,rec.cols应该是0列和1行。但是,情况并非如此。我没有任何其他文件的这个问题。我想有人帮我弄清楚为什么会这样,以及我能做些什么来解决它。

library(data.table)
library(magrittr)
library(dplyr)
library(stringr)
library(tidyr)

# even after the item-level data is deleted from each data set, when all of the data sets are merged, the merged file is still too large to manipulate efficiently
# however, only the data from the initial and last assessments within the year of interest are needed for the reports
# the function below only deletes assessment data that are not from the initial or last assessments

del.mid <- function(file){

  lower <- str_match(pattern = "cafas|pecfas", string = file)
  upper <- toupper(lower) 

  fas <- fread(input = paste0("data/", lower, "/items-del/", file), sep = ",", header = TRUE, na.strings = "", stringsAsFactors = FALSE, colClasses = "character", fill = TRUE, data.table = FALSE)

  #
  # some cases have assessments entered that do not have any scores
  # remove all data for these assessments so that they are not used as the initial most recent assessment
  #

  fas.long <- fas %>%
    gather(key = key, value = value, starts_with(paste0(upper, "_E")), na.rm = FALSE) %>%
    separate(col = key, into = c("scale", "time", "var"), sep = "_", extra = "merge") %>%
    spread(key = var, value = value)

  other.vars <- c("episodeNum", "clientAge", "assessID", "VersionNum", "SA_SoftwareID", "SA_ID", "Prog_SoftwareID", "Prog_ID", "assessDate", "daysSinceEpisodeStart", "status", "isLocked", "isDeleted", "isAmendment", "adminDesc", "adminMonths", "adminOtherDesc", "timePeriod", "noSubsequent", "language", "youthLivArrange", "youthLivArrangeIsOther", "youthOtherLivArrange", "assessorSoftwareID", "Primary_CaregiverID", "Primary_CaregiverRelation")

  # score variables for the CAFAS and PECFAS are different

  if(lower == "cafas") score.vars <- c("TotalScore", "TotalStrengths", "Tier", "ChildMgmtSkills", "PervasBhImpair", "RiskPsychotic", "RiskSevereSubstance", "RiskSuicideAttempt", "RiskSuicideIdeation", "RiskAggresive", "RiskSexual", "RiskFire", "RiskRunaway", "RiskExceedPrimaryCG", "RiskExceedNonCust", "RiskExceedSurrogate", "TotalSevere", "TotalScoreDiff", "MeaningfulReliableDiff", "SevereDiff", "PervasiveImpairDiff", "IndicatorImprove", "SchoolScore", "SchoolStrengthTotal", "HomeScore", "HomeStrengthTotal", "CommunityScore", "CommunityStrengthTotal", "BehaviorScore", "BehaviorStrengthTotal", "MoodScore", "MoodStrengthTotal", "SelfHarmScore", "SelfHarmStrengthTotal", "SubstanceScore", "SubstanceStrengthTotal", "ThinkingScore", "ThinkingStrengthTotal", "CG_P_Needs", "CG_P_Support", "Primary_CG_StrengthTotal", "NonCust_CaregiverID", "NonCust_CaregiverRelation", "NC_CG_Needs", "NC_CG_Support", "NC_CG_StrengthTotal", "Surr_CaregiverID", "Surr_CaregiverRelation", "Surr_CG_Needs", "Surr_CG_Support", "Surr_CG_StrengthTotal")

  if(lower == "pecfas") score.vars <- c("TotalScore", "TotalStrengths", "Tier", "ChildMgmtSkills", "PervasBhImpair", "RiskThinking", "RiskSuicideAttempt", "RiskSuicideIdeation", "RiskAggresive", "RiskSexual", "RiskFire", "RiskRunaway", "RiskDelinquent", "RiskExceedPrimaryCG", "RiskExceedNonCust", "RiskExceedSurrogate", "TotalSevere", "TotalScoreDiff", "MeaningfulReliableDiff", "SevereDiff", "PervasiveImpairDiff", "IndicatorImprove", "SchoolScore", "SchoolStrengthTotal", "HomeScore", "HomeStrengthTotal", "CommunityScore", "CommunityStrengthTotal", "BehaviorScore", "BehaviorStrengthTotal", "MoodScore", "MoodStrengthTotal", "SelfHarmScore", "SelfHarmStrengthTotal", "ThinkingScore", "ThinkingStrengthTotal", "CG_P_Needs", "CG_P_Support", "Primary_CG_StrengthTotal", "NonCust_CaregiverID", "NonCust_CaregiverRelation", "NC_CG_Needs", "NC_CG_Support", "NC_CG_StrengthTotal", "Surr_CaregiverID", "Surr_CaregiverRelation", "Surr_CG_Needs", "Surr_CG_Support", "Surr_CG_StrengthTotal")

  fas.long[fas.long[, score.vars] %>% is.na %>% rowSums == length(score.vars), other.vars] <- NA

  fas.wide <- fas.long %>%
    unite(col = scaletime, c(scale, time), sep = "_") %>%
    gather(key = var, value = value, adminDesc:youthOtherLivArrange) %>%
    unite(col = key, c(scaletime, var), sep = "_") %>%
    spread(key = key, value = value, fill = NA)

  fas <- fas.wide[, names(fas)]
  rm(fas.long, fas.wide, other.vars)

  # only interested in cases with assessments within the year of interest for report
  # year of interest for report: 10/1/2013 to 9/30/2016

  start.date <- as.Date("2009-10-1", format = "%Y-%m-%d")
  end.date <- as.Date("2016-9-30", format = "%Y-%m-%d")

  # convert date variables to date type
  fas[, grep(pattern = "Date", x = names(fas))] <- lapply(X = fas[, grep(pattern = "Date", x = names(fas))], FUN = as.Date, format = "%m/%d/%Y")

  #
  # for cases with revised initial assessments, replace initial data with revised initial data
  #

  # create variables that indicate whether a revised assessment is present or absent
  # some cases have multiple initial assessments, so check all of them for revisions
  # variable "max.e" reflects the maximum number of initial assessments present in the data sets

  seq.e <- grep(pattern = "E[1-9]+TInitial_assessDate", x = names(fas), value = TRUE) %>% regmatches(., gregexpr("[0-9]+", .)) %>% unlist
  max.e <- seq.e %>% max %>% as.numeric

  fas[, sapply(X = seq.e, FUN = function(x) paste0("e", x, "tr.initial"))] <- FALSE

  # function to replace initial assessment data with revised data

  for(num in seq.e){
    datecol <- paste0(upper, "_E", num, "TRInitial_assessDate")
    logcol <- paste0("e", num, "tr.initial")
    if(length(grep(pattern = paste0("E", num, "TRInitial"), names(fas))) > 0) fas[(fas[ , datecol] >= start.date) & (fas[, datecol] <= end.date) & (!is.na(fas[ , datecol])), logcol] <- TRUE
    fas[fas[, logcol] == TRUE, grep(pattern = paste0("E", num, "TInitial"), x = names(fas))] <- fas[fas[, logcol] == TRUE, grep(pattern = paste0("E", num, "TRInitial"), x = names(fas))]
  }
  rm(num, datecol, logcol)

  # delete revised assessment data columns because they are redundant
  # prevents them from being considered as a recent assessment
  if(length(grep(pattern = paste0("E[1-", max.e, "]TRInitial"), names(fas))) > 0)
    fas <- fas[, -grep(pattern = paste0("E[1-", max.e, "]TRInitial"), names(fas))]

  #
  # create variables to indicate the presence or absence of an initial assessment
  #

  fas[, sapply(X = seq.e, FUN = function(x) paste0("e", x, "t.initial"))] <- FALSE

  for(num in seq.e){
    datecol <- paste0(upper, "_E", num, "TInitial_assessDate")
    fas[fas[, datecol] >= start.date & fas[, datecol] <= end.date & !is.na(fas[, datecol]), paste0("e", num, "t.initial")] <- TRUE
    fas[rowSums(is.na(fas[paste0(upper, "_E", num, "TInitial_", score.vars)])) == length(score.vars), paste0("e", num, "t.initial")] <- FALSE # do not consider an assessment as present if all the scores are missing
  }

  rm(num, datecol)

  # some cases have multiple initial assessments within the year of interest
  # use the earliest one
  # create a variable that indicates which initial assessment is used

  fas$et.initial <- 0

  for(num in rev(seq.e)){
    fas$et.initial[fas[, paste0("e", num, "t.initial")] == TRUE] <- as.numeric(num)
  }
  rm(num)

  #
  # create a set of initial assessment variables that applies to all children
  #

  # delete cases without an initial assessment
  fas <- fas[fas$et.initial != 0, ]

  init.cols <- fas %>% select(matches(paste0(upper, "_E1TInitial")))

  init.colnames <- grep(pattern = paste0(upper, "_E1TInitial"), x = names(fas), value = TRUE)
  init.colnames <- paste0(upper, "_", gsub(pattern = paste0(upper, "_E1T"), replacement = "", x = init.colnames))
  names(init.cols) <- init.colnames

  fas <- cbind(fas, init.cols)
  rm(init.cols, init.colnames)

  for(num in seq.e[2:length(seq.e)]){
    fas[fas$et.initial == as.numeric(num), grep(pattern = paste0(upper, "_Initial"), x = names(fas))] <- fas[fas$et.initial == as.numeric(num), grep(pattern = paste0(upper, "_E", num, "TInitial"), x = names(fas))]
  }

  #
  # create a set of most recent assessment variables that applies to all children
  #

  # create variables that indicate number of days that have passed since the initial assessment for each assessment period

  days <- lapply(X = fas[, grep(pattern = "assessDate", names(fas))], FUN = function(x) x - fas[, paste0(upper, "_Initial_assessDate")]) %>% unlist %>% matrix(nrow = nrow(fas), byrow = FALSE) %>% data.frame

  names(days) <- grep(pattern = "assessDate", x = names(fas), value = TRUE) %>% sub(pattern = paste0(upper, "_"), replacement = "", x = .) %>% sub(pattern = "_assessDate", replacement = "", x = .) %>% paste(., "_days_pass", sep = "")

  fas <- cbind(fas, days)
  rm(days)

  fas$Initial_days_pass <- NULL

  # replace 0 or negative days passed with NA
  fas[, grep(pattern = "days_pass", x = names(fas))][fas[, grep(pattern = "days_pass", names(fas))] <= 0] <- NA

  # the code following the chunk below creates variables for the most recent assessment data
  # it is easier to execute the code if cases with only an initial assessment are not in the data set
  # add these cases back to the data set later

  # some data sets do not have any cases with a most recent assessment
  if((rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) %>% sum != nrow(fas)) {

    init.only <- fas[which(rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])), ]
    fas <- fas[-which(rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])), ]

    # calculate the max days between the initial and most recent assessments
    fas$rec_days <- apply(X = fas[grep(pattern = "days_pass", x = names(fas), value = TRUE)], MARGIN = 1, FUN = max, na.rm = TRUE)

    # create a variable that indicates the time period of the most recent assessment

    days.pass.cols <- fas[grep(pattern = "days_pass", x = names(fas), value = TRUE)]

    fas$rec_time <- colnames(days.pass.cols)[apply(X = days.pass.cols, MARGIN = 1, FUN = which.max)] %>% sub(pattern = "_days_pass", replacement = "", x = .)

    rm(days.pass.cols)

    # create most recent assessment variables

    rec.cols <- apply(X = fas, MARGIN = 1, FUN = function(x) x[grep(pattern = paste0(upper, "_Recent_"), names(x))] <- x[grep(pattern = paste(paste0(upper, "_"), x["rec_time"], sep = ""), names(x))]) %>% t %>% data.frame

    rec.colnames <- grep(pattern = paste0(upper, "_Initial"), x = names(fas), value = TRUE) %>% gsub(pattern = "Initial", replacement = "Recent", x = .)
    names(rec.cols) <- rec.colnames

    fas <- cbind(fas, rec.cols)
    rm(rec.cols)
  }

  #
  # limit data set to background variables, initial, and most recent assessments
  # add cases only with initial assessment back to data set
  #

  fas <- fas[, c(1:which(colnames(fas) == "E1_endDate"), which(colnames(fas) == "e1tr.initial"):ncol(fas))]

  # some data sets do not have any cases with a most recent assessment
  if((rowSums(is.na(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) == ncol(fas[grep(pattern = "days_pass", names(fas), value = TRUE)])) %>% sum != nrow(fas)) {

    init.only <- init.only[, c(1:which(colnames(init.only) == "E1_endDate"), which(colnames(init.only) == "e1tr.initial"):ncol(init.only))]

    fas <- bind_rows(fas, init.only)
    rm(init.only)
  }

  # add filename to data sets
  fas$filename <- file

  write.csv(fas, paste0("data/", lower, "/bref/", gsub(pattern = "\\.", replacement = "_bref.", file)), row.names = FALSE)

}



# delete assessment data that are not from the initial or last assessments from all cafas data sets

cafas.files <- list.files("data/cafas/items-del", pattern = ".csv")

for (file in cafas.files){
  del.mid(file)
}

# delete assessment data that are not from the initial or last assessments from all pecfas data sets

pecfas.files <- list.files("data/pecfas/items-del", pattern = ".csv")

for (file in pecfas.files){
  del.mid(file)
}

0 个答案:

没有答案