在 R 中是否有一种高效的方式来读取转置的 .csv 文件?
例如,请考虑以下文本文件:
Name,Peter,Paul,Mary
Age,40,5,38
可以使用以下代码将其读入一个列类型正确的 data.table:
library(data.table)
# Write a small transposed CSV: one line per variable, one field per
# observation.
file <- tempfile("tmp.txt")
writeLines("Name,Peter,Paul,Mary\nAge,40,5,38\n", file)
lines <- readLines(file)
# writeLines() appends a newline to a string that already ends in "\n", so
# readLines() returns a trailing empty element. Drop empty lines by content
# rather than by a hard-coded position.
lines <- lines[nzchar(lines)]
# Turn every line into a one-column CSV (commas become line breaks), let
# fread() parse each of them, then bind the resulting columns side by side.
lines <- gsub(",", "\n", lines, fixed = TRUE)
lines <- lapply(lines, fread)
do.call(cbind, lines)
#> Name Age
#> 1: Peter 40
#> 2: Paul 5
#> 3: Mary 38
有没有更简单的方法来实现这一目标?是否有更高效的版本(我的文件是1 GB)?
请注意,对于 data.table 这样的列式存储,这种按列(列主序)存放的文件理应比常规的按行存放的文件更容易读取。
答案 0 :(得分:3)
这是评论中@Dirk Eddelbuettel建议方法的实现。
// NOTE(review): this Java fragment looks unrelated to the surrounding R
// answer; confirm whether it belongs here.
// Splits a string laid out as "yyyymmddword" into its date fields and the
// trailing word.
String name = "yyyymmddword"; // layout placeholder, not a parseable value
int year = Integer.parseInt(name.substring(0, 4));
// Characters 4-5 are the month ("mm") and 6-7 the day ("dd").
int month = Integer.parseInt(name.substring(4, 6));
int day = Integer.parseInt(name.substring(6, 8));
// The remainder is free text, so keep it as a String under its own name
// instead of parsing it into a second variable called `year`.
String word = name.substring(8);
答案 1 :(得分:3)
# Read the transposed file without a header, flip it with transpose(), paste
# every flipped row into one space-separated line and let read.table()
# re-parse those lines, using the first one as the header.
DT <- setDT(read.table(
  text = do.call(paste, transpose(fread(file, header = FALSE))),
  header = TRUE,
  stringsAsFactors = FALSE
))
DT
#>     Name Age
#> 1: Peter  40
#> 2:  Paul   5
#> 3:  Mary  38
sapply(DT, class)
#>        Name         Age
#> "character"   "integer"
答案 2 :(得分:2)
不幸的是,tfread
似乎不存在。
对于 200 个观测 × 20000 个字符/整数变量 / 20000 个观测 × 200 个字符/整数变量 这两种规模,各建议方案的耗时如下:
- readLines-fread(@jan-glx):7 s / 1.2 s
- fread-transpose-paste-read.table(@Onymambu):8 s / 36 s
- fread-transpose-write.csv-paste-fread(@Clayton Stanley):5 分钟 / 12 秒
- fread(@jan-glx):2.4 s / 1.6 s
- fread-transpose-type.convert(@Frank):4.2 s / 3.6 s
代码:
library(data.table)
# Build the benchmark input: p pairs of "Name,..." / "Age,..." lines, each
# holding 2 * n comma-separated values.
file <- tempfile("tmp.txt")
p <- 100 # = 200 lines/columns
n <- 10000 # = 20000 values per line / rows
name_line <- paste("Name", paste0(rep(c("Peter", "Paul"), n), collapse = ","), sep = ",")
age_line <- paste("Age", paste0(rep(c("40", "5"), n), collapse = ","), sep = ",")
# Open the connection explicitly so it can be closed again: this flushes the
# data to disk before the benchmarks read it and avoids leaking the connection.
con <- file(file, "wb")
writeLines(rep(c(name_line, age_line), p), con)
close(con)
# Benchmark 1: readLines() followed by one fread() call per line.
system.time({
  # gsub() is vectorised, so all lines are rewritten in a single call: every
  # comma becomes a line break, turning each line into a one-column CSV.
  lines <- gsub(",", "\n", readLines(file), fixed = TRUE)
  lines <- lapply(lines, fread)
  # Bind the parsed columns side by side.
  dt <- do.call(cbind, lines)
  dim(dt)
})
# Benchmark 2: fread() + transpose() + paste() + read.table().
system.time({
  # Each row of the transposed table is pasted into one space-separated line;
  # read.table() then re-parses those lines, using the first one as header.
  DT <- setDT(read.table(
    text = do.call(paste, transpose(fread(file, header = FALSE))),
    header = TRUE,
    stringsAsFactors = FALSE
  ))
  dim(DT)
})
system.time({ # 3
# Benchmark 3: fread + transpose + write.csv + paste + fread.
# Read every field as character and without a header row.
aTbl = fread(file, colClasses="character", header=F)
# The chain below, step by step:
#   [, .SD]             takes all columns of aTbl
#   [, transpose(.SD)]  transposes the table
#   [, setnames(...)]   names the columns after the values in the first row
#   [2:.N]              keeps rows 2..N, i.e. drops the row now used as names
#   [, fread(...)]      writes the table as CSV text (captured from stdout)
#                       and parses that text again with fread
#                       (presumably so that column types are re-detected)
#   [, {bTbl <<- ...}]  stores a copy of the result in bTbl via <<-
# invisible() returns the value of the chain invisibly.
invisible(
aTbl[, .SD
][, transpose(.SD)
][, setnames(.SD, .SD[1, t(.SD)])
][2:.N
][, fread(paste0(capture.output(write.csv(.SD, stdout(), row.names=F, quote=F)), collapse='\n'))
][, {bTbl <<- copy(.SD); .SD}
]
)
# Report the dimensions of the table stored by the last step of the chain.
dim(bTbl)
})
# Benchmark 4 (wide): hand fread() a `transpose` command line built around the
# quoted path of the input file.
system.time({
  dt <- fread(sprintf("transpose -t -l 20005x205 --fsep , \"%s\"", file))
  dim(dt)
})
# Benchmark 4 (long): same `transpose` command line as the wide variant, but
# with the size argument given as 205x20005.
system.time({
  dt <- fread(sprintf("transpose -t -l 205x20005 --fsep , \"%s\"", file))
  dim(dt)
})
# Benchmark 5: chunked reading with iotools.
system.time({
  infile <- file(file, "rb")
  df <- iotools::chunk.tapply(infile, function(x) {
    # Split the chunk into a character matrix, paste each matrix column back
    # into one comma-separated line (i.e. transpose the text) and parse the
    # result with fread().
    fread(paste0(apply(iotools::mstrsplit(x, sep = ","), 2, paste0, collapse = ","), collapse = "\n"))
  }, CH.MERGE = cbind)
  # The connection was opened explicitly above, so release it explicitly.
  close(infile)
  dim(df)
})
# Benchmark 6: fread() + transpose() + type.convert().
system.time({
  d <- fread(file, header = FALSE)
  # Column 1 holds the variable names; the remaining columns are transposed
  # and every resulting column is converted to its natural type.
  # as.is = TRUE keeps strings as character (no factors) and avoids the
  # warning type.convert() issues when as.is is left unspecified.
  d <- d[, lapply(transpose(.SD[, -1]), type.convert, as.is = TRUE)][, setnames(.SD, d[[1]])]
  dim(d)
})
答案 3 :(得分:2)
将 @lmo 的评论扩展为使用 iotools 的实现:
# Read the file chunk by chunk with iotools. Each chunk is split into a
# character matrix, the matrix columns are pasted back into comma-separated
# lines (i.e. the text is transposed) and parsed by fread(); the per-chunk
# results are merged with cbind.
infile <- file(file, "rb")
dt <- iotools::chunk.tapply(infile, function(x) {
  fread(paste0(apply(iotools::mstrsplit(x, sep = ","), 2, paste0, collapse = ","), collapse = "\n"))
}, CH.MERGE = cbind)
# Close the connection explicitly instead of leaving it to the garbage
# collector.
close(infile)
这比其他现有解决方案更有效。
答案 4 :(得分:1)
扩展@ngm的注释以使用命令行工具:
# Compile transpose.c into an executable named "transpose".
gcc transpose.c -o transpose
编译后可在 R 中这样调用:fread(paste0("transpose -t -l 205x20005 --fsep , \"", file, "\""))。这需要事先了解表的大致尺寸,并要求行结尾为单个字符。这种做法非常不方便,但比其他解决方案更快。