我正在尝试使用data.table::fread
进行dq检查。所以我写下面的代码。我从文件传递所有变量的类。但是在for循环中,所有变量都被转换为character
,因此如果循环失败。
#Select a file
file <- file.choose()
#seperator
seps <- ","
library(data.table)
#read first 100 rows
data <- fread(file, sep= seps, header = TRUE, na.strings = c("NA",".",""," "), nrows = 100)
#get class of all the variables
classes <- sapply(data, class)
col_names <- colnames(data)
reference <- cbind(col_names, classes)
write.csv(reference,"reference.csv", , row.names = FALSE)
classes_new <- read.csv("reference.csv")
classes_final <- as.vector(classes_new[,2])
cols <- ncol(data)
num_qc <- data.frame(non_miss = numeric(0), miss= numeric(0), miss_percent= numeric(0), mean_val= numeric(0), min_val= numeric(0), quant_5= numeric(0), quant_99= numeric(0), max_val= numeric(0))
#for loop to check class of variable and then run the if loop
for (i in 1:40){
classes <- paste0("classes_final","[",i,"]")
sel = i
data_new <- fread(file, header = TRUE, na.strings = c("NA",".",""," "), colClasses = classes, select = sel)
num.data <- data_new[[1]]
coln <- colnames(data_new)
if(class(paste0("data_new","$",coln))== "numeric"){
#if (class(data_new[[1]]) == "numeric")
#column name
#name <- colnames(data_new)
# non-missing values
n.non.miss <- sum(!is.na(num.data))
# missing values
n.miss <- sum(is.na(num.data))
# missing percentage
n.miss.percent <- 100*(n.miss/nrow(data_new))
# mean value
n.mean <- mean(num.data, na.rm= TRUE)
# minimum value
n.min <- min(num.data, na.rm= TRUE)
# maximum value
n.max <- max(num.data, na.rm= TRUE)
# quantiles
n.quant5 <- quantile(num.data, probs= 0.50, na.rm= TRUE)
# quantiles
n.quant99 <- quantile(num.data, probs= 0.99, na.rm= TRUE)
# combine all results in data frame
n.output <- cbind(n.non.miss, n.miss, n.miss.percent, n.mean, n.min, n.quant5, n.quant99,n.max )}}