我有以下CSV文件:
data,key
"VA1,VA2,20140524,,0,0,5969,20140523134902,S7,S1147,140,20140523134902,m/t",4503632376496128
"VA2,VA3,20140711,,0,0,8824,20140601095714,S1,S6402,175,20140601095839,m/t",4503643113914368
我尝试用 R 读取它,但我不需要 `key` 列的值,只需要读取 `data` 列的值,并把它按逗号拆分成多列。使用以下代码,我几乎得到了我需要的结果:
# Read the file without a header row, skipping the first line and with
# quote handling and comment detection switched off.
data <- read.csv(
  fileCSV,
  header = FALSE,
  sep = ",",
  skip = 1,
  comment.char = "",
  quote = ""
)
我跳过了标题行(`skip = 1`),声明文件没有标题(`header = FALSE`),并声明没有引号字符(`quote = ""`)。但结果中 V1 和 V13 列里仍然带有引号字符,并且多出了一个 V14 列:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1 "VA1 VA2 20140524 NA 0 0 5969 2.014121e+13 S7 S1147 140 2.014121e+13 m/t" 4.503608e+15
我是否应该在读取 CSV 之后再想办法把它们删除?或者,有没有更好的方法来读取这样的 CSV 文件?
更新:我使用以下方法删除引号:
# Drop the opening quote left at the start of column V1 and the closing
# quote left at the end of column V13.
data[["V1"]] <- sub("^\"", "", data[["V1"]])
data[["V13"]] <- sub("\"$", "", data[["V13"]])
但这些列的类型由 `factor` 变成了 `character`。
答案 0 :(得分:2)
配合 `fread()` 使用系统命令怎么样?
# Recreate the sample file: a header line followed by two records whose first
# field is a quoted, comma-separated string and whose second field is the key.
writeLines(
'data,key
"VA1,VA2,20140524,,0,0,5969,20140523134902,S7,S1147,140,20140523134902,m/t",4503632376496128
"VA2,VA3,20140711,,0,0,8824,20140601095714,S1,S6402,175,20140601095839,m/t",4503643113914368', "x.txt"
)
# library() stops with an error when bit64 is not installed, whereas
# require() would only return FALSE and let the script carry on.
library(bit64)
# Shell pre-processing before parsing:
#   rev | cut -d '"' -f2 | rev  keeps only the text between the two quotes,
#                               which drops the trailing key field;
#   tail -n +2                  drops the header line.
# The pipeline is passed through cmd = so it is explicitly a shell command.
data.table::fread(
  cmd = "cat x.txt | rev | cut -d '\"' -f2 | rev | tail -n +2"
)
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
# 1: VA1 VA2 20140524 NA 0 0 5969 20140523134902 S7 S1147 140 20140523134902 m/t
# 2: VA2 VA3 20140711 NA 0 0 8824 20140601095714 S1 S6402 175 20140601095839 m/t
根据要求,这是对这两种方法的测试。
## 150k lines
# writeLines() already terminates every element with a newline, so the
# elements must not end in "\n" themselves; otherwise an empty line is
# written after the header and after every record.
writeLines(
  c(
    "data,key",
    rep_len(
      '"VA1,VA2,20140524,,0,0,5969,20140523134902,S7,S1147,140,20140523134902,m/t",4503632376496128',
      1.5e5
    )
  ),
  "test.txt"
)
## fread() in well under 1 second (with bit64 loaded)
# The shell pipeline keeps only the text between the quotes of each line, and
# grep then keeps the lines starting with "V", which drops the header line.
# The pipeline is passed through cmd = so it is explicitly a shell command.
system.time({
  dt <- data.table::fread(
    cmd = "cat test.txt | rev | cut -d '\"' -f2 | rev | grep -e '^V'"
  )
})
# user system elapsed
# 0.945 0.108 0.547
## your current read.csv() method in just over two seconds
system.time({
  # Read with quote handling disabled, skipping the header line.
  df <- read.csv(
    "test.txt",
    header = FALSE,
    sep = ",",
    skip = 1,
    comment.char = "",
    quote = ""
  )
  # Remove the quote characters left at the start of V1 and the end of V13.
  df[["V1"]] <- sub("^\"", "", df[["V1"]])
  df[["V13"]] <- sub("\"$", "", df[["V13"]])
})
# user system elapsed
# 2.134 0.000 2.129
dim(dt)
# [1] 150000 13
dim(df)
# [1] 150000 14