我的数据顺序很重要。如果我使用read.csv
将CSV加载到R中,那么是否能保证数据框中行的顺序与CSV中行的顺序一致?
如果我把一堆CSV用rbind
合并在一起,然后使用subset
来获取我感兴趣的数据,情况又如何呢?
例如:
1.csv
foo,bar
a,123
a,456
c,789
2.csv
foo,bar
d,987
a,999
b,654
a,321
以下代码:
# Load both CSV files; header = TRUE treats the first line as column names.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
data1 <- read.csv("1.csv", header = TRUE)
data2 <- read.csv("2.csv", header = TRUE)
# Combine the rows of data1 and data2 into one data frame.
all_data <- rbind(data1, data2)
# Keep only the rows whose foo column equals "a".
filtered <- subset(all_data, foo == "a")
...是否始终生成如下的filtered
:
foo bar
1 a 123
2 a 456
3 a 999
4 a 321
...并且这种行为是否适用于任意CSV输入和过滤器?
答案 0 :(得分:4)
阅读read.table
的源代码。它使用scan
基函数,它本身使用file
和textConnection
函数。所有这些似乎都表明数据是按顺序读取的(“行”由行分隔符界定),并按读取顺序依次存入结果。
# Body of read.table() as quoted by the answer: reads delimited text from a
# file name, a connection, or the `text` argument and returns a data.frame.
# The input is consumed through a single connection: a few leading lines are
# peeked at to work out the column layout, pushed back, and then the whole
# input is read with one scan() call.
function (file, header = FALSE, sep = "", quote = "\"'", dec = ".",
numerals = c("allow.loss", "warn.loss", "no.loss"), row.names,
col.names, as.is = !stringsAsFactors, na.strings = "NA",
colClasses = NA, nrows = -1, skip = 0, check.names = TRUE,
fill = !blank.lines.skip, strip.white = FALSE, blank.lines.skip = TRUE,
comment.char = "#", allowEscapes = FALSE, flush = FALSE,
stringsAsFactors = default.stringsAsFactors(), fileEncoding = "",
encoding = "unknown", text, skipNul = FALSE)
{
# Only `text` supplied: read from an in-memory text connection instead.
if (missing(file) && !missing(text)) {
file <- textConnection(text, encoding = "UTF-8")
encoding <- "UTF-8"
on.exit(close(file))
}
# A character `file` is opened as a text-mode read connection (with
# `fileEncoding` when given) and closed again when the function exits.
if (is.character(file)) {
file <- if (nzchar(fileEncoding))
file(file, "rt", encoding = fileEncoding)
else file(file, "rt")
on.exit(close(file))
}
if (!inherits(file, "connection"))
stop("'file' must be a character string or connection")
# A connection passed in unopened is opened here and closed on exit.
if (!isOpen(file, "rt")) {
open(file, "rt")
on.exit(close(file))
}
# Encoding used when pushing peeked lines back onto the connection.
pbEncoding <- if (encoding %in% c("", "bytes", "UTF-8"))
encoding
else "bytes"
numerals <- match.arg(numerals)
# Consume (and discard) the first `skip` lines.
if (skip > 0L)
readLines(file, skip)
# Peek at up to 5 leading lines (fewer when `nrows` is small) to determine
# the column layout.
nlines <- n0lines <- if (nrows < 0L)
5
else min(5L, (header + nrows))
lines <- .External(C_readtablehead, file, nlines, comment.char,
blank.lines.skip, quote, sep, skipNul)
if (encoding %in% c("UTF-8", "latin1"))
Encoding(lines) <- encoding
nlines <- length(lines)
# No lines at all: only acceptable when `col.names` was supplied.
if (!nlines) {
if (missing(col.names))
stop("no lines available in input")
rlabp <- FALSE
cols <- length(col.names)
}
else {
if (all(!nzchar(lines)))
stop("empty beginning of file")
# The peeked lines are pushed back twice: the first copy is consumed by
# the field-counting scans just below, the second copy is left for the
# header read and the final data scan.
# NOTE(review): `file == 0L` presumably identifies stdin (the cleanup
# clears stdin's pushback) - confirm.
if (nlines < n0lines && file == 0L) {
pushBack(c(lines, lines, ""), file, encoding = pbEncoding)
on.exit((clearPushBack(stdin())))
}
else pushBack(c(lines, lines), file, encoding = pbEncoding)
# Fields of the first peeked line (candidate header).
first <- scan(file, what = "", sep = sep, quote = quote,
nlines = 1, quiet = TRUE, skip = 0, strip.white = TRUE,
blank.lines.skip = blank.lines.skip, comment.char = comment.char,
allowEscapes = allowEscapes, encoding = encoding,
skipNul = skipNul)
col1 <- if (missing(col.names))
length(first)
else length(col.names)
# Count the fields on each of the remaining peeked lines.
col <- numeric(nlines - 1L)
if (nlines > 1L)
for (i in seq_along(col)) col[i] <- length(scan(file,
what = "", sep = sep, quote = quote, nlines = 1,
quiet = TRUE, skip = 0, strip.white = strip.white,
blank.lines.skip = blank.lines.skip, comment.char = comment.char,
allowEscapes = allowEscapes, encoding = encoding,
skipNul = skipNul))
cols <- max(col1, col)
# rlabp: the first line has exactly one field fewer than the widest peeked
# line, which is taken to mean the first column holds row labels (and, if
# `header` was not given, that the first line is a header).
rlabp <- (cols - col1) == 1L
if (rlabp && missing(header))
header <- TRUE
if (!header)
rlabp <- FALSE
if (header) {
# Consume the header line from the second pushed-back copy.
.External(C_readtablehead, file, 1L, comment.char,
blank.lines.skip, quote, sep, skipNul)
if (missing(col.names))
col.names <- first
else if (length(first) != length(col.names))
warning("header and 'col.names' are of different lengths")
}
else if (missing(col.names))
col.names <- paste0("V", 1L:cols)
# Sanity checks between the number of names and the number of columns.
if (length(col.names) + rlabp < cols)
stop("more columns than column names")
if (fill && length(col.names) > cols)
cols <- length(col.names)
if (!fill && cols > 0L && length(col.names) > cols)
stop("more column names than columns")
if (cols == 0L)
stop("first five rows are empty: giving up")
}
if (check.names)
col.names <- make.names(col.names, unique = TRUE)
if (rlabp)
col.names <- c("row.names", col.names)
# Expand `colClasses` to one entry per column: unnamed values are recycled,
# named values are matched against the column names.
nmColClasses <- names(colClasses)
if (is.null(nmColClasses)) {
if (length(colClasses) < cols)
colClasses <- rep_len(colClasses, cols)
}
else {
tmp <- rep_len(NA_character_, cols)
names(tmp) <- col.names
i <- match(nmColClasses, col.names, 0L)
if (any(i <= 0L))
warning("not all columns named in 'colClasses' exist")
tmp[i[i > 0L]] <- colClasses[i > 0L]
colClasses <- tmp
}
# Build the `what` template for scan(): character by default, a typed
# prototype for the known atomic classes, and NULL for "NULL" columns
# (which are excluded via `keep`).
what <- rep.int(list(""), cols)
names(what) <- col.names
colClasses[colClasses %in% c("real", "double")] <- "numeric"
known <- colClasses %in% c("logical", "integer", "numeric",
"complex", "character", "raw")
what[known] <- sapply(colClasses[known], do.call, list(0))
what[colClasses %in% "NULL"] <- list(NULL)
keep <- !sapply(what, is.null)
# Read all remaining input with one scan() call on the same connection.
data <- scan(file = file, what = what, sep = sep, quote = quote,
dec = dec, nmax = nrows, skip = 0, na.strings = na.strings,
quiet = TRUE, fill = fill, strip.white = strip.white,
blank.lines.skip = blank.lines.skip, multi.line = FALSE,
comment.char = comment.char, allowEscapes = allowEscapes,
flush = flush, encoding = encoding, skipNul = skipNul)
# Number of records read, taken from the first kept column.
nlines <- length(data[[which.max(keep)]])
if (cols != length(data)) {
warning("cols = ", cols, " != length(data) = ", length(data),
domain = NA)
cols <- length(data)
}
# Normalise `as.is` (logical, numeric indices, or column names) into a
# logical vector with one entry per column.
if (is.logical(as.is)) {
as.is <- rep_len(as.is, cols)
}
else if (is.numeric(as.is)) {
if (any(as.is < 1 | as.is > cols))
stop("invalid numeric 'as.is' expression")
i <- rep.int(FALSE, cols)
i[as.is] <- TRUE
as.is <- i
}
else if (is.character(as.is)) {
i <- match(as.is, col.names, 0L)
if (any(i <= 0L))
warning("not all columns named in 'as.is' exist")
i <- i[i > 0L]
as.is <- rep.int(FALSE, cols)
as.is[i] <- TRUE
}
else if (length(as.is) != cols)
stop(gettextf("'as.is' has the wrong length %d != cols = %d",
length(as.is), cols), domain = NA)
# Convert the kept columns that were not read with a known atomic class:
# unspecified classes go through type.convert(), the rest through the
# matching coercion. The row-label column (if any) is left untouched.
do <- keep & !known
if (rlabp)
do[1L] <- FALSE
for (i in (1L:cols)[do]) {
data[[i]] <- if (is.na(colClasses[i]))
type.convert(data[[i]], as.is = as.is[i], dec = dec,
numerals = numerals, na.strings = character(0L))
else if (colClasses[i] == "factor")
as.factor(data[[i]])
else if (colClasses[i] == "Date")
as.Date(data[[i]])
else if (colClasses[i] == "POSIXct")
as.POSIXct(data[[i]])
else methods::as(data[[i]], colClasses[i])
}
# Work out the row names: taken from the detected label column, from the
# column named/indexed by `row.names`, from a supplied character vector, or
# otherwise generated by .set_row_names() from the record count.
compactRN <- TRUE
if (missing(row.names)) {
if (rlabp) {
row.names <- data[[1L]]
data <- data[-1L]
keep <- keep[-1L]
compactRN <- FALSE
}
else row.names <- .set_row_names(as.integer(nlines))
}
else if (is.null(row.names)) {
row.names <- .set_row_names(as.integer(nlines))
}
else if (is.character(row.names)) {
compactRN <- FALSE
if (length(row.names) == 1L) {
rowvar <- (1L:cols)[match(col.names, row.names, 0L) ==
1L]
row.names <- data[[rowvar]]
data <- data[-rowvar]
keep <- keep[-rowvar]
}
}
else if (is.numeric(row.names) && length(row.names) == 1L) {
compactRN <- FALSE
rlabp <- row.names
row.names <- data[[rlabp]]
data <- data[-rlabp]
keep <- keep[-rlabp]
}
else stop("invalid 'row.names' specification")
# Drop the "NULL"-class columns.
data <- data[keep]
if (is.object(row.names) || !(is.integer(row.names)))
row.names <- as.character(row.names)
# Explicit row names must match the record count and be unique and non-NA.
if (!compactRN) {
if (length(row.names) != nlines)
stop("invalid 'row.names' length")
if (anyDuplicated(row.names))
stop("duplicate 'row.names' are not allowed")
if (anyNA(row.names))
stop("missing values in 'row.names' are not allowed")
}
# Turn the list of columns into a data.frame and return it.
class(data) <- "data.frame"
attr(data, "row.names") <- row.names
data
}
答案 1 :(得分:3)
下面是一段简单的代码,可用于复核read.csv
和subset
的结果:
比较read.csv
与readLines
下面这段代码可以将read.csv
的结果与readLines
的结果进行比较(readLines函数逐行读取文件)
library("readr" )
library("rlist")
# Pick the two CSV files interactively. Only file1 is compared in this
# snippet; file2 is selected here for use by later code.
file1 <- file.choose() # Select your csv file1
file2 <- file.choose() # Select your csv file2

# readLines: parse the file by hand, line by line, splitting on commas.
# NOTE: this naive split does not understand quoted fields that contain
# commas, so it is only a valid reference for simple CSV files.
input_list <- strsplit(readLines(file1), ",", fixed = TRUE)
header_fields <- input_list[[1]]
# input_list[-1] is empty for a header-only file, whereas
# 2:length(input_list) would count backwards (2, 1) and turn the header
# line into a data row.
data_rows <- input_list[-1]
row_matrix <- if (length(data_rows) > 0) {
  # Row-bind the per-line field vectors into a character matrix (base R).
  do.call(rbind, data_rows)
} else {
  matrix(character(0), nrow = 0, ncol = length(header_fields))
}
db_readLines <- data.frame(row_matrix)
names(db_readLines) <- header_fields

# read.csv
db_readcsv <- read.csv(file1, header = TRUE, sep = ",")

# Comparison: cell-by-cell, in row order. The shapes are checked first because
# `==` on data frames of different sizes is an error. Cells whose comparison
# is NA (e.g. an empty field that read.csv parsed as NA) are skipped instead
# of making `if` fail on a missing value. Values are compared after coercion,
# so a number written as "1.50" in the file will not match the parsed 1.5.
same_shape <- identical(dim(db_readcsv), dim(db_readLines))
if (same_shape && isTRUE(all(db_readcsv == db_readLines, na.rm = TRUE))) {
  cat("Same data.frame")
} else {
  cat("Data.frames are differents")
}
您可以将其与csv文件一起使用来比较结果,并验证read.csv
是否将行顺序保留为readLines
。
比较subset
与rbind
+基本过滤
关于问题的第二部分,这是另一个简单的测试:
# Read both files; file1 and file2 are the paths chosen earlier.
data1 <- read.csv(file1, header = TRUE, sep = ",")
data2 <- read.csv(file2, header = TRUE, sep = ",")

# Route 1: stack the two data frames first, then filter with subset().
all_data <- rbind(data1, data2)
filtered1 <- subset(all_data, foo == "a")

# Route 2: filter each data frame, then stack the results.
# which() drops rows where foo is NA, matching subset(); plain logical
# indexing would insert an all-NA row for each of them and make the two
# results differ in size. drop = FALSE keeps a data frame even when there is
# only one column.
filtered2 <- rbind(
  data1[which(data1$foo == "a"), , drop = FALSE],
  data2[which(data2$foo == "a"), , drop = FALSE]
)

# Comparison: cell-by-cell, in row order (row names are not compared). The
# shapes are checked first because `==` on data frames of different sizes is
# an error; NA cells are skipped so `if` never sees a missing value.
same_shape <- identical(dim(filtered1), dim(filtered2))
if (same_shape && isTRUE(all(filtered1 == filtered2, na.rm = TRUE))) {
  cat("Same data.frame")
} else {
  cat("Data.frames are differents")
}
您可以在代码中包含此类测试,但显然这是低效且多余的。
答案 2 :(得分:3)
可以安全地假设所有这些函数(read.csv
,rbind
和subset
)都保证像原始csv一样保留数据的顺序。
就个人而言,我更喜欢使用dplyr::filter
而不是base::subset
。正如这篇文章(原文链接“this”)中所解释的那样,这两个函数的作用几乎相同。主要区别在于subset
在?subset
中附带警告:“这是一个旨在方便交互使用的便捷函数。对于编程,最好使用标准的取子集函数,如[
,特别是参数subset的非标准求值可能会产生意想不到的后果。” filter
旨在通过交互式和编程方式与dplyr
和tidyverse
的其余部分进行稳健协作,并在必要时提供单独的标准评估版filter_
。所以也许filter
是一个更安全的赌注,特别是如果您已经使用dplyr
框架。我遇到filter
的唯一不利之处在于它不会保留rownames,而subset
会这样做。
无论哪种方式,我都不认为你需要担心行被重新打乱。根据我的经验,所有这些函数生成的R对象总是保持原始数据的顺序。如果你想要非常小心,那么采用@user127649的建议,添加一个唯一的ID列作为备份,也不会有什么坏处。我个人总是倾向于更省事的做法,但为了安心,这么做也许是值得的!