如果我有两个表(File1)和(File2)
> dput(File1)
structure(list(Column.1 = structure(1:3, .Label = c("Row 1",
"Row 2", "Row 3"), class = "factor"), Column.2 = c(NA, NA, NA
), Column.3 = c(NA, NA, NA), colNames = c(TRUE, TRUE, TRUE)), class = "data.frame", row.names = c(NA,
-3L))
> dput(File2)
structure(list(Column.1 = structure(1:3, .Label = c("Row 1",
"Row 2", "Row 3"), class = "factor"), Column.2 = c(1, 2, 34),
Column.3 = c(NA, NA, NA), colNames = c(TRUE, TRUE, TRUE)), class = "data.frame", row.names = c(NA,
-3L))
我想确认 File1 和 File2 的列名、列类型以及行数和列数是否一致:如果全部相同则返回 TRUE,否则返回 FALSE。应该如何把这些检查添加到我写的这段代码中?
我尝试了Compare column types between two data frames中的一些答案,但我只是寻找一个真或假的答案。这是我目前的代码。
# Compare the structure of two data frames.
#
# Checks, in order: number of rows, number of columns, column names and the
# class of every column. The cell values themselves are never compared.
#
# Args:
#   File1, File2: data frames to compare.
#
# Returns:
#   TRUE when every check passes; otherwise FALSE, after printing which
#   check failed.
check_file <- function(File1, File2) {
  if (nrow(File1) != nrow(File2)) {
    print("Non matching number of rows")
    return(FALSE)
  }
  if (ncol(File1) != ncol(File2)) {
    print("non matching number of columns")
    return(FALSE)
  }
  if (!identical(names(File1), names(File2))) {
    print("Non matching names of columns")
    return(FALSE)
  }
  # Compare classes column by column: class(File1) is just "data.frame" and
  # says nothing about the columns. identical() also copes with columns that
  # carry more than one class. An all-NA column is logical, so it does not
  # match a numeric column.
  same_type <- vapply(
    seq_along(File1),
    function(i) identical(class(File1[[i]]), class(File2[[i]])),
    logical(1)
  )
  if (!all(same_type)) {
    print("Non matching column types")
    return(FALSE)
  }
  TRUE
}
# Run the structural comparison and keep the logical result.
check <- check_file(File1, File2)
# return() is only valid inside a function body; calling it at top level
# raises an error. Evaluating the flag directly yields the same TRUE/FALSE.
isTRUE(check)
我认为剩下的就是列类型的检查。例如,在上面 dput 的输出中,File2 的第 2 列是数字,而 File1 的第 2 列全是 NA。两者的数值不必相同,但这种情况需要返回 FALSE,因为 File1 的该列是 NA。如果 File1 的第 2 列是 3, 2, 564,则应返回 TRUE。
答案 0 :(得分:1)
# TRUE only when File1 and File2 agree on column names, dimensions and the
# class of every column. identical() is used instead of `==` so that inputs
# of different length, or columns carrying several classes, give a clean
# FALSE rather than recycling, a warning or an error.
all(
  vapply(
    list(colnames, dim, function(x) lapply(x, class)),  # properties to compare
    function(f) identical(f(File1), f(File2)),          # one flag per property
    logical(1)
  )
)
[1] FALSE # numeric != logical
#all(
# sapply(
# c(colnames, dim, function(x){sapply(x, class)}),
# function(f) all(f(File1) == f(File1))
# )
# )
#[1] TRUE
[编辑0] 使用 dim 而不是 nrow。
[编辑1]
如果两列的类不同,但其中一列为空(全为 NA),则返回 TRUE:
# Example data. df1 and df2 have four columns and differ only in Column2
# (integer in df1, double in df2).
df1 <- data.frame(
  Column1 = paste("Row", 1:3),
  Column2 = 1:3,
  Column3 = NA,
  colNames = TRUE
)
df2 <- df1
df2[["Column2"]] <- c(1, 2, 34)

# df3 and df4 have three columns. Column2 and Column3 are all NA in df3,
# and hold character and double values respectively in df4.
df3 <- data.frame(
  Column1 = paste("Row", 1:3),
  Column2 = NA,
  Column3 = NA
)
df4 <- df3
df4[["Column2"]] <- rep("ddd", nrow(df4))
df4[["Column3"]] <- c(3, 4, 2)
df1
# Column1 Column2 Column3 colNames
#1 Row 1 1 NA TRUE
#2 Row 2 2 NA TRUE
#3 Row 3 3 NA TRUE
df2
# Column1 Column2 Column3 colNames
#1 Row 1 1 NA TRUE
#2 Row 2 2 NA TRUE
#3 Row 3 34 NA TRUE
请注意 class(df1[,2]) == "integer",但 class(df2[,2]) == "numeric"。
df3
# Column1 Column2 Column3
#1 Row 1 NA NA
#2 Row 2 NA NA
#3 Row 3 NA NA
df4
# Column1 Column2 Column3
#1 Row 1 ddd 3
#2 Row 2 ddd 4
#3 Row 3 ddd 2
# Test whether two data frames have the same structure.
#
# Two data frames match when they have identical column names, identical
# dimensions and, for every column position, either the same class or at
# least one of the two columns consists entirely of NA values (an empty
# column is treated as compatible with any type).
#
# Args:
#   x, y: data frames to compare.
#
# Returns:
#   A single TRUE or FALSE.
identical_df <- function(x, y) {
  # Plain if/return instead of nested ifelse(): the conditions are scalars,
  # and later checks must not run once an earlier one has failed.
  if (!identical(colnames(x), colnames(y))) {
    return(FALSE)
  }
  if (!identical(dim(x), dim(y))) {
    return(FALSE)
  }

  all_na <- function(column) all(is.na(column))

  # Compare column by column. identical() on the class vectors also handles
  # columns with more than one class (e.g. POSIXct), where an elementwise
  # `==` on sapply() output would misbehave.
  column_ok <- vapply(
    seq_along(x),
    function(i) {
      identical(class(x[[i]]), class(y[[i]])) ||
        all_na(x[[i]]) ||
        all_na(y[[i]])
    },
    logical(1)
  )
  all(column_ok)
}
分别在 df1、df2;df1、df3;df3、df4 上测试该函数:
# Same data frame on both sides: names, dimensions and classes all match.
identical_df(df1, df1)
#[1] TRUE
# Column2 is integer in df1 but numeric in df2, and neither column is all NA.
identical_df(df1, df2)
#[1] FALSE
# df1 has four columns and df3 has three, so column names and dim differ.
identical_df(df1, df3)
#[1] FALSE
# Columns 2 and 3 have different classes in df3 and df4, but both columns
# are entirely NA in df3, so the mismatch is tolerated.
identical_df(df3, df4)
#[1] TRUE
# ==============================================================================
# Evaluation of
# all((sapply(x, class) == sapply(y, class)) |
# (apply(is.na(x), 2, prod) | apply(is.na(y), 2, prod))
# )
# for x = df3, y = df4
#
# +-------------------------------------------------+--------+--------+--------+
# |Expression |Column1 |Column2 |Column3 |
# +-------------------------------------------------+--------+--------+--------+
# |sapply(x, class) == sapply(y, class) +--------<|TRUE |FALSE |FALSE |
# + | +--------+--------+--------+
# |apply(is.na(x), 2, prod) | +--<|0 |1 |1 |
# + OR-+<OR | | | |
# |apply(is.na(y), 2, prod) | | +--<|0 |0 |0 |
# | | | | | | |
# | | +----->|FALSE |TRUE |TRUE |
# | | | | | |
# | | +--------+--------+--------+
# | +-------->|TRUE |TRUE |TRUE |
# +-------------------------------------------------+--------+--------+--------+
答案 1 :(得分:0)
您可以使用 identical 函数。
基于您的函数:
# Compare the structure of two data frames using identical().
#
# Checks, in order: the class of the objects, number of rows, number of
# columns, column names and the storage mode of every column. The cell
# values themselves are never compared.
#
# Args:
#   File1, File2: data frames to compare.
#
# Returns:
#   TRUE when every check passes; otherwise FALSE, after printing which
#   check failed.
check_file <- function(File1, File2) {
  if (!identical(class(File1), class(File2))) {
    print("Not Same Class")
    return(FALSE)
  }
  # Rows and columns are reported separately so the message names the
  # dimension that actually differs.
  if (!identical(nrow(File1), nrow(File2))) {
    print("Non matching number of rows")
    return(FALSE)
  }
  if (!identical(ncol(File1), ncol(File2))) {
    print("non matching number of columns")
    return(FALSE)
  }
  if (!identical(names(File1), names(File2))) {
    print("Non matching names of columns")
    return(FALSE)
  }
  # Per-column mode, the same quantity summary.default(x)[, 3] reports, but
  # computed directly so a data frame with zero columns does not error.
  column_modes <- function(df) {
    vapply(df, mode, character(1), USE.NAMES = FALSE)
  }
  if (!identical(column_modes(File1), column_modes(File2))) {
    print("Not Same Str")
    return(FALSE)
  }
  TRUE
}
测试您的data.frames:
# Sample inputs from the question, rebuilt with data.frame() rather than the
# raw dput() structure. Column.2 is all NA (logical) in File1 and numeric in
# File2; every other column is the same.
File1 <- data.frame(
  Column.1 = factor(c("Row 1", "Row 2", "Row 3")),
  Column.2 = c(NA, NA, NA),
  Column.3 = c(NA, NA, NA),
  colNames = c(TRUE, TRUE, TRUE)
)
File2 <- data.frame(
  Column.1 = factor(c("Row 1", "Row 2", "Row 3")),
  Column.2 = c(1, 2, 34),
  Column.3 = c(NA, NA, NA),
  colNames = c(TRUE, TRUE, TRUE)
)
check <- check_file(File1, File2)
check
[1] TRUE
或者使用不匹配的行数:
df1 <- data.frame(x = 1:20)
df2 <- data.frame(x = 1:10)
check <- check_file(df1, df2)
[1] "non matching number of columns"
check
[1] FALSE