我有一个巨大的字符向量数据集(14GB,2亿行)。我已经用fread读取了它(在48核128 GB服务器上花了30分钟以上的时间)。每个字符串都是由多个字段的信息拼接而成的。例如,表格的第一行如下所示:
2014120900000001091500bbbbcompany_name00032401
其中前8个字符以YYYYMMDD格式表示date
,接下来的8个字符为id
,再接下来的6个字符是HHMMSS格式的time
,然后的16个字符是name
(以若干个b作为前缀填充),最后8个字符是price
(末2位为小数部分)。
我需要将上述1列data.table转换为5列:date, id, time, name, price
。
对于上述字符向量,结果将为:date = "2014-12-09", id = 1, time = "09:15:00", name = "company_name", price = 324.01
我正在寻找(非常)快速高效的dplyr / data.table解决方案。现在,我正在使用substr
:
# Current approach: take the first 8 characters of each string (YYYYMMDD) and
# parse them as a Date. `d` is presumably the raw string column -- TODO confirm.
date = as.Date(substr(d, 1, 8), "%Y%m%d");
而且要花很多时间才能执行!
更新:使用
readr::read_fwf
,我可以在5-10分钟内读取文件。显然,读取速度比fread
快。下面是代码:
# Read the fixed-width file with readr, then convert the result to a data.table.
# Record layout: date(8) id(8) time(6) name(16) price(8).
f <- "file_name"
num_cols <- 5
col_widths <- c(8, 8, 6, 16, 8)
# Compact readr column spec: c = character, i = integer, n = number.
col_classes <- "ciccn"
col_names <- c("date", "id", "time", "name", "price")
# takes 5-10 mins
data <- readr::read_fwf(
  file = f,
  col_positions = readr::fwf_widths(col_widths, col_names),
  col_types = col_classes,
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
  progress = TRUE
)
# Turn the result of read_fwf() into a data.table.
setDT(data)
# object.size(data) / 2^30; # 17.5 GB
答案 0 :(得分:1)
可能的解决方案:
{
"listings": [
{
"name": "Prod-1",
"manufacturedDate": "12-03-18"
},
{
"name": "Prod-2",
"manufacturedDate": "25-03-18"
},
{
"name": "Prod-1",
"manufacturedDate": "12-09-18"
}
],
"sortCriteria": [
{
"sortText": "Price - Low To High",
"sortFor": "Price",
"sortOrder": "ASC",
"sortUrl": "https://api.to.bring.products.orderedBy.Price"
},
{
"sortText": "Price - High To Low",
"sortFor": "Price",
"sortOrder": "DESC",
"sortUrl": "https://api.to.bring.products.orderedBy.Price.Desc"
},
{
"sortText": "ManufacturedDate - Latest To Oldest",
"sortFor": "Year",
"sortOrder": "DESC",
"sortUrl": "https://api.to.bring.products.orderedBy.ManufacturedDateTime.Desc"
},
{
"sortText": "ManufacturedDateTime - Oldest To Latest",
"sortFor": "Year",
"sortOrder": "ASC",
"sortUrl": "https://api.to.bring.products.orderedBy.ManufacturedDateTime.Asc"
}
]
}
给出:
library(data.table)
library(stringi)

# Field widths of the fixed-width record: date, id, time, name, price.
widths <- c(8, 8, 6, 16, 8)
# Start and end character positions of every field, derived from the widths.
sp <- c(1, cumsum(widths[-length(widths)]) + 1)
ep <- cumsum(widths)

# Cut column V1 into one character column per field (auto-named V1..V5).
DT[, lapply(seq_along(sp), function(i) stri_sub(V1, sp[i], ep[i]))]
包括一些其他处理以获得所需的结果:
V1 V2 V3 V4 V5
1: 20141209 00000001 091500 bbbbcompany_name 00032401
给出:
# First cut V1 into five character columns (V1..V5) using the start/end
# positions in `sp` / `ep`, then convert each piece to its target type.
DT[
  ,
  lapply(seq_along(sp), function(k) stri_sub(V1, sp[k], ep[k]))
][
  ,
  .(
    date = as.Date(V1, "%Y%m%d"),
    id = as.integer(V2),
    time = as.ITime(V3, "%H%M%S"),
    # Drop the four-character "bbbb" padding in front of the name.
    name = sub("^(bbbb)", "", V4),
    # The last two digits of the price field are the decimals.
    price = as.numeric(V5) / 100
  )
]
         date id     time         name  price
1: 2014-12-09  1 09:15:00 company_name 324.01
但是您实际上正在读取固定宽度的文件。因此,也可以考虑base R中的read.fwf
,或readr中的read_fwf
,或者像我前一阵子那样编写自己的fread.fwf
函数:
使用的数据:
#' Read a fixed-width text file via data.table::fread
#'
#' Reads every line of `file` as one string and cuts it into fields with
#' `stri_sub()`.
#'
#' @param file Path of the file; passed to `fread(file = )`.
#' @param widths Numeric vector of field widths in characters, left to right.
#' @param enc Encoding passed to `fread(encoding = )`; defaults to "UTF-8".
#' @return A data.table with one character column per field.
fread.fwf <- function(file, widths, enc = "UTF-8") {
  # Each field ends at the running total of the widths and starts one
  # character after the previous field's end.
  ends <- cumsum(widths)
  starts <- c(1, ends[-length(ends)] + 1)
  # sep = "\n" keeps each line as a single column, V1.
  raw <- fread(file = file, header = FALSE, sep = "\n", encoding = enc)
  raw[
    ,
    lapply(seq_along(starts), function(k) stri_sub(V1, starts[k], ends[k]))
  ]
}
答案 1 :(得分:0)
也许您的解决方案还不错。
我正在使用以下数据:
# Benchmark data: the sample record repeated 100,000 times in one `text` column.
df <- data.table(text = rep("2014120900000001091500bbbbcompany_name00032401", 100000))
您的解决方案:
> system.time(df[, .(date = as.Date(substr(text, 1, 8), "%Y%m%d"),
+ id = as.integer(substr(text, 9, 16)),
+ time = substr(text, 17, 22),
+ name = substr(text, 23, 38),
+ price = as.numeric(substr(text, 39, 46))/100)])
user system elapsed
0.17 0.00 0.17
@Jaap解决方案:
> library(data.table)
> library(stringi)
>
> widths <- c(8,8,6,16,8)
> sp <- c(1, cumsum(widths[-length(widths)]) + 1)
> ep <- cumsum(widths)
>
> system.time(df[, lapply(seq_along(sp), function(i) stri_sub(text, sp[i], ep[i]))
+ ][, .(date = as.Date(V1, "%Y%m%d"),
+ id = as.integer(V2),
+ time = V3,
+ name = sub("^(bbbb)","",V4),
+ price = as.numeric(V5)/100)])
user system elapsed
0.20 0.00 0.21
尝试使用read.fwf:
> setClass("myDate")
> setAs("character","myDate", function(from) as.Date(from, format = "%Y%m%d"))
> setClass("myNumeric")
> setAs("character","myNumeric", function(from) as.numeric(from)/100)
>
> ff <- function(x) {
+ file <- textConnection(x)
+ read.fwf(file, c(8, 8, 6, 16, 8),
+ col.names = c("date", "id", "time", "name", "price"),
+ colClasses = c("myDate", "integer", "character", "character", "myNumeric"))
+ }
>
> system.time(df[, as.list(ff(text))])
user system elapsed
2.33 6.15 8.49
所有输出都相同。
答案 2 :(得分:-1)
也许尝试使用带有数字的矩阵而不是data.frame。聚合应该花费更少的时间。