我正在Ubuntu下优化一段R代码(该代码严重依赖data.table)。
下面的代码显示了我对data.table对象进行的一些数据转换,包括:
我担心的是,从数据加载到完成这些计算的那一刻,RAM消耗都急剧增加。
RAM字节数与运行时间的秒数:
开始时,根据object.size(),我的data.table对象dat占用 1.2 Gb 。
这时,根据object.size(),我的data.table对象dat占用了 2.1 Gb 。
也就是说,存在额外的RAM开销,超过 5.5 - 3 - (2.1 - 1.2) = 1.6 Gb。这意味着在对象变大的同时,RAM的增长幅度超过了对象本身大小的增加。
问题:您能给我一些如何使用data.table进行相同转换的指导吗?
## date var
## Adds column `Date`: the UTC calendar date of each event.
## `When` is divided by 1000 and then interpreted as seconds since
## 1970-01-01, i.e. it is handled as epoch milliseconds.
## as.Date() applied to a POSIXct ignores a `format` argument (it is only
## consulted when parsing character input), so the former no-op
## `format = "%Y-%m-%d"` was dropped; the POSIXct method already defaults to
## the UTC time zone.
dat[, Date := as.Date(
  as.POSIXct(as.numeric(When) / 1000, origin = "1970-01-01", tz = "utc")
)]
## limit report dates to minimal needed range
## One row per distinct Date, with the number of events on that date in `N`.
date_tbl <- dat[, list(N = .N), by = "Date"]

## Abort unless the number of distinct dates reaches three times
## minimum_train_sample, scaled by 5/7 on the assumption that weekends are
## not busy at all in the whole organization.
if (nrow(date_tbl) < 3 * minimum_train_sample / 7 * 5) {
  stop("not enough historical data to run any detection: sparse dates with data")
}

## Earliest date present in the data.
report_min_date <- dat[, min(Date)]
## limit report wheres to minimal available data
## Per-Where summary: first date seen, last date seen, and the number of
## distinct dates with at least one event.
where_date_actions <- dat[
  ,
  .(
    min_date = min(Date),
    max_date = max(Date),
    unique_dates = uniqueN(Date)
  ),
  by = Where
]

## Keep only rows whose Where (a) has at least minimum_train_sample * 5/7
## distinct dates and (b) first appears no later than
## last_reported_date - last_days_predict - minimum_train_sample.
dat <- dat[
  Where %in% where_date_actions[
    unique_dates >= minimum_train_sample / 7 * 5 &
      min_date <= (last_reported_date - last_days_predict - minimum_train_sample)
  ]$Where
]

## Nothing survived the filter: no Where has enough history to train on.
if (nrow(dat) == 0L) {
  stop("not enough data by Where to run any detection: none of the monitored subsystems accumulated enough length of train data")
}
## time vars
## Adds two columns by reference in a single pass:
##   datetime_when - event timestamp as POSIXct (tz attribute "utc"), built
##                   from `When` / 1000 interpreted as seconds since 1970-01-01
##   Hour          - the same instant formatted as "%Y-%m-%d %H" (character)
## Hour is formatted from the timestamp just built instead of from a second,
## identical as.POSIXct() conversion; format() honours the tz attribute of its
## input, so the result is unchanged while one full-length temporary vector
## is no longer allocated.
dat[, c("datetime_when", "Hour") := {
  event_time <- as.POSIXct(
    as.numeric(When) / 1000,
    origin = "1970-01-01",
    tz = "utc"
  )
  list(event_time, format(event_time, "%Y-%m-%d %H"))
}]
gc()
## convert strings and names
## Lower-case the listed columns in place. `.SDcols` restricts .SD to just
## those columns (instead of subsetting the full .SD), and `:=` already
## modifies `dat` by reference, so no reassignment of `dat` is needed.
var_names <- "Who"
dat[
  ,
  (var_names) := lapply(.SD, function(x) tolower(as.character(x))),
  .SDcols = var_names
]
gc()
## replace blanks
## Overwrite only the offending rows by reference rather than rebuilding each
## whole column with ifelse(), which allocates several full-length temporaries
## per call.
## NOTE(review): assumes ObjectPath and Workstation are character columns
## (Who is character after the step above) -- confirm against the loader.
dat[is.na(ObjectPath) | ObjectPath == "", ObjectPath := "n/a"]   # NAs and ''
dat[is.na(Workstation) | Workstation == "", Workstation := "n/a"] # NAs and ''
## Only empty strings are replaced in Who; NA values are left untouched, as
## before (an NA comparison in `i` selects no row).
dat[Who == "", Who := "n/a"]
gc()
## time between successive events
## Sort by user, then chronologically; every per-user lag below relies on
## this ordering.
setorder(dat, Who, datetime_when)

## Seconds elapsed since the previous event of the same Who (NA for each
## user's first event). Despite the column name, this is the gap to the
## preceding row, not the following one.
dat[
  ,
  next_time_diff_secs := as.numeric(
    datetime_when - shift(datetime_when, 1L),
    units = "secs"
  ),
  by = Who
]

## Action of the current event and of the same user's previous event.
dat[, diff_object_action_after := What_Action]
## The lag is taken within each Who, matching next_time_diff_secs above.
## Previously it was computed over the whole table, so the first event of
## every user picked up the last action of the preceding user; it is now NA
## for those rows.
dat[, diff_object_action_before := shift(What_Action, n = 1L), by = Who]

## cumulative count
## Running event number per Who in chronological order. The table is still
## sorted from the setorder() call above (`:=` does not reorder rows), so the
## sort is not repeated.
dat[, counter := seq_len(.N), by = Who]