Question

我的数据顺序很重要。如果我使用read.csv将CSV加载到R中，那么数据框保证中的行的顺序是否与CSV的行匹配？

如果我将一堆CSV和rbind加在一起，然后使用subset来获取我感兴趣的数据呢？

例如：

1.csv

foo,bar
a,123
a,456
c,789

2.csv

foo,bar
d,987
a,999
b,654
a,321

以下是：

data1<-read.csv("1.csv", header=T)
data2<-read.csv("2.csv", header=T)
all_data<-rbind(data1, data2)
filtered<-subset(all_data, foo=="a")

... 始终生成filtered：

   foo  bar
1    a  123
2    a  456
3    a  999
4    a  321

...并且这种行为是否适用于任意CSV输入和过滤器？

Answer 1

阅读read.table的源代码。它使用scan基函数，它本身使用file和textConnection函数。所有这些似乎都指向您能够按顺序读取数据（“行”基于分隔符“行”）并将其输入。

function (file, header = FALSE, sep = "", quote = "\"'", dec = ".", 
    numerals = c("allow.loss", "warn.loss", "no.loss"), row.names, 
    col.names, as.is = !stringsAsFactors, na.strings = "NA", 
    colClasses = NA, nrows = -1, skip = 0, check.names = TRUE, 
    fill = !blank.lines.skip, strip.white = FALSE, blank.lines.skip = TRUE, 
    comment.char = "#", allowEscapes = FALSE, flush = FALSE, 
    stringsAsFactors = default.stringsAsFactors(), fileEncoding = "", 
    encoding = "unknown", text, skipNul = FALSE) 
{
    if (missing(file) && !missing(text)) {
        file <- textConnection(text, encoding = "UTF-8")
        encoding <- "UTF-8"
        on.exit(close(file))
    }
    if (is.character(file)) {
        file <- if (nzchar(fileEncoding)) 
            file(file, "rt", encoding = fileEncoding)
        else file(file, "rt")
        on.exit(close(file))
    }
    if (!inherits(file, "connection")) 
        stop("'file' must be a character string or connection")
    if (!isOpen(file, "rt")) {
        open(file, "rt")
        on.exit(close(file))
    }
    pbEncoding <- if (encoding %in% c("", "bytes", "UTF-8")) 
        encoding
    else "bytes"
    numerals <- match.arg(numerals)
    if (skip > 0L) 
        readLines(file, skip)
    nlines <- n0lines <- if (nrows < 0L) 
        5
    else min(5L, (header + nrows))
    lines <- .External(C_readtablehead, file, nlines, comment.char, 
        blank.lines.skip, quote, sep, skipNul)
    if (encoding %in% c("UTF-8", "latin1")) 
        Encoding(lines) <- encoding
    nlines <- length(lines)
    if (!nlines) {
        if (missing(col.names)) 
            stop("no lines available in input")
        rlabp <- FALSE
        cols <- length(col.names)
    }
    else {
        if (all(!nzchar(lines))) 
            stop("empty beginning of file")
        if (nlines < n0lines && file == 0L) {
            pushBack(c(lines, lines, ""), file, encoding = pbEncoding)
            on.exit((clearPushBack(stdin())))
        }
        else pushBack(c(lines, lines), file, encoding = pbEncoding)
        first <- scan(file, what = "", sep = sep, quote = quote, 
            nlines = 1, quiet = TRUE, skip = 0, strip.white = TRUE, 
            blank.lines.skip = blank.lines.skip, comment.char = comment.char, 
            allowEscapes = allowEscapes, encoding = encoding, 
            skipNul = skipNul)
        col1 <- if (missing(col.names)) 
            length(first)
        else length(col.names)
        col <- numeric(nlines - 1L)
        if (nlines > 1L) 
            for (i in seq_along(col)) col[i] <- length(scan(file, 
                what = "", sep = sep, quote = quote, nlines = 1, 
                quiet = TRUE, skip = 0, strip.white = strip.white, 
                blank.lines.skip = blank.lines.skip, comment.char = comment.char, 
                allowEscapes = allowEscapes, encoding = encoding, 
                skipNul = skipNul))
        cols <- max(col1, col)
        rlabp <- (cols - col1) == 1L
        if (rlabp && missing(header)) 
            header <- TRUE
        if (!header) 
            rlabp <- FALSE
        if (header) {
            .External(C_readtablehead, file, 1L, comment.char, 
                blank.lines.skip, quote, sep, skipNul)
            if (missing(col.names)) 
                col.names <- first
            else if (length(first) != length(col.names)) 
                warning("header and 'col.names' are of different lengths")
        }
        else if (missing(col.names)) 
            col.names <- paste0("V", 1L:cols)
        if (length(col.names) + rlabp < cols) 
            stop("more columns than column names")
        if (fill && length(col.names) > cols) 
            cols <- length(col.names)
        if (!fill && cols > 0L && length(col.names) > cols) 
            stop("more column names than columns")
        if (cols == 0L) 
            stop("first five rows are empty: giving up")
    }
    if (check.names) 
        col.names <- make.names(col.names, unique = TRUE)
    if (rlabp) 
        col.names <- c("row.names", col.names)
    nmColClasses <- names(colClasses)
    if (is.null(nmColClasses)) {
        if (length(colClasses) < cols) 
            colClasses <- rep_len(colClasses, cols)
    }
    else {
        tmp <- rep_len(NA_character_, cols)
        names(tmp) <- col.names
        i <- match(nmColClasses, col.names, 0L)
        if (any(i <= 0L)) 
            warning("not all columns named in 'colClasses' exist")
        tmp[i[i > 0L]] <- colClasses[i > 0L]
        colClasses <- tmp
    }
    what <- rep.int(list(""), cols)
    names(what) <- col.names
    colClasses[colClasses %in% c("real", "double")] <- "numeric"
    known <- colClasses %in% c("logical", "integer", "numeric", 
        "complex", "character", "raw")
    what[known] <- sapply(colClasses[known], do.call, list(0))
    what[colClasses %in% "NULL"] <- list(NULL)
    keep <- !sapply(what, is.null)
    data <- scan(file = file, what = what, sep = sep, quote = quote, 
        dec = dec, nmax = nrows, skip = 0, na.strings = na.strings, 
        quiet = TRUE, fill = fill, strip.white = strip.white, 
        blank.lines.skip = blank.lines.skip, multi.line = FALSE, 
        comment.char = comment.char, allowEscapes = allowEscapes, 
        flush = flush, encoding = encoding, skipNul = skipNul)
    nlines <- length(data[[which.max(keep)]])
    if (cols != length(data)) {
        warning("cols = ", cols, " != length(data) = ", length(data), 
            domain = NA)
        cols <- length(data)
    }
    if (is.logical(as.is)) {
        as.is <- rep_len(as.is, cols)
    }
    else if (is.numeric(as.is)) {
        if (any(as.is < 1 | as.is > cols)) 
            stop("invalid numeric 'as.is' expression")
        i <- rep.int(FALSE, cols)
        i[as.is] <- TRUE
        as.is <- i
    }
    else if (is.character(as.is)) {
        i <- match(as.is, col.names, 0L)
        if (any(i <= 0L)) 
            warning("not all columns named in 'as.is' exist")
        i <- i[i > 0L]
        as.is <- rep.int(FALSE, cols)
        as.is[i] <- TRUE
    }
    else if (length(as.is) != cols) 
        stop(gettextf("'as.is' has the wrong length %d  != cols = %d", 
            length(as.is), cols), domain = NA)
    do <- keep & !known
    if (rlabp) 
        do[1L] <- FALSE
    for (i in (1L:cols)[do]) {
        data[[i]] <- if (is.na(colClasses[i])) 
            type.convert(data[[i]], as.is = as.is[i], dec = dec, 
                numerals = numerals, na.strings = character(0L))
        else if (colClasses[i] == "factor") 
            as.factor(data[[i]])
        else if (colClasses[i] == "Date") 
            as.Date(data[[i]])
        else if (colClasses[i] == "POSIXct") 
            as.POSIXct(data[[i]])
        else methods::as(data[[i]], colClasses[i])
    }
    compactRN <- TRUE
    if (missing(row.names)) {
        if (rlabp) {
            row.names <- data[[1L]]
            data <- data[-1L]
            keep <- keep[-1L]
            compactRN <- FALSE
        }
        else row.names <- .set_row_names(as.integer(nlines))
    }
    else if (is.null(row.names)) {
        row.names <- .set_row_names(as.integer(nlines))
    }
    else if (is.character(row.names)) {
        compactRN <- FALSE
        if (length(row.names) == 1L) {
            rowvar <- (1L:cols)[match(col.names, row.names, 0L) == 
                1L]
            row.names <- data[[rowvar]]
            data <- data[-rowvar]
            keep <- keep[-rowvar]
        }
    }
    else if (is.numeric(row.names) && length(row.names) == 1L) {
        compactRN <- FALSE
        rlabp <- row.names
        row.names <- data[[rlabp]]
        data <- data[-rlabp]
        keep <- keep[-rlabp]
    }
    else stop("invalid 'row.names' specification")
    data <- data[keep]
    if (is.object(row.names) || !(is.integer(row.names))) 
        row.names <- as.character(row.names)
    if (!compactRN) {
        if (length(row.names) != nlines) 
            stop("invalid 'row.names' length")
        if (anyDuplicated(row.names)) 
            stop("duplicate 'row.names' are not allowed")
        if (anyNA(row.names)) 
            stop("missing values in 'row.names' are not allowed")
    }
    class(data) <- "data.frame"
    attr(data, "row.names") <- row.names
    data
}

Answer 2

这是一个基本代码，可用于仔细检查来自read.csv和subset的结果：

将read.csv与readLines

进行比较

这里有一个代码可以将read.csv的结果与readLines进行比较（函数逐行读取）

  library("readr" )
  library("rlist")
  file1<-file.choose() #Select your csv file1
  file2<-file.choose() #Select your csv file2

  #readLines
  input_list<-strsplit(readLines(file1),",")
  db_readLines<-data.frame(list.rbind(input_list[2:length(input_list)]))
  names(db_readLines)<-input_list[[1]]

  #readd.csv
  db_readcsv<-read.csv(file1,header = T,sep = ",")

  #Comparison
  if ((sum(db_readcsv==db_readLines)/(nrow(db_readcsv)*ncol(db_readcsv)))==1)
  {
    cat("Same data.frame")
  } else
  {
    cat("Data.frames are differents")
  }

您可以将其与csv文件一起使用来比较结果，并验证read.csv是否将行顺序保留为readLines。

将subset与rbind +基本过滤

进行比较

关于问题的第二部分另一个简单的测试：

data1<-read.csv(file1, header=T,sep=",")
  data2<-read.csv(file2, header=T,sep=",")
  all_data<-rbind(data1, data2)
  filtered1<-subset(all_data, foo=="a")

  filtered2<-rbind(data1[data1$foo=="a",],data2[data2$foo=="a",])

  #Comparison
  if ((sum(filtered1==filtered2)/(nrow(filtered2)*ncol(filtered2)))==1)
  {
    cat("Same data.frame")
  } else
  {
    cat("Data.frames are differents")
  }

您可以在代码中包含此类测试，但显然这是低效且多余的。

Answer 3

可以安全地假设所有这些函数（read.csv，rbind和subset）都保证像原始csv一样保留数据的顺序。

就个人而言，我更喜欢使用dplyr::filter而不是base::subset。正如this中所解释的那样，这两项工作几乎相同。主要区别在于subset在?subset中附带警告：＆＃34;这是一个便于交互使用的便利功能。对于编程，最好使用标准的子集函数，如[，特别是参数子集的非标准评估可能会产生意想不到的后果。＆＃34; filter旨在通过交互式和编程方式与dplyr和tidyverse的其余部分进行稳健协作，并在必要时提供单独的标准评估版filter_。所以也许filter是一个更安全的赌注，特别是如果您已经使用dplyr框架。我遇到filter的唯一不利之处在于它不会保留rownames，而subset会这样做。

无论哪种方式，我都不认为你需要担心行重新洗牌。根据我的经验，所有这些函数总是生成按原始数据排序的R对象。如果你想要非常小心，那么使用@ user127649的建议并添加一个唯一的ID列作为备份就不会有什么坏处。我总是赞成更懒惰的选择，但值得安心！

R中保留的CSV订单

3 个答案: