我正在尝试使用R为大约4000个城市生成单独的csv文件。为此我编写了一个小的for循环。
所有城市数据都存储在一个名为cities_files的数据框中。在for循环中,我将这个合并后的数据框按城市拆分为较小的临时数据框,并将每个临时数据框写入一个csv文件。
# Question's original script: write one CSV file per city found in cities_files.
# Collect the distinct city names.
cities.list <- unique(cities_files$city_name)
# NOTE(review): the loop variable reuses the name `cities.list`, so the vector of
# unique names built above is overwritten and never used. The loop iterates over
# cities_files$city_name itself, i.e. once per ROW rather than once per distinct
# city, so every city's file is regenerated once for each of its rows.
for( cities.list in cities_files$city_name ) {
# Rows whose city_name equals the current loop value.
tmp <- subset(cities_files,city_name == cities.list)
# Distinct city name(s) present in the subset; used to build the file name.
cit.name <- unique(tmp$city_name)
# Output file name: "<city name>.csv".
fn = paste(paste(cit.name) ,".csv", sep = '')
# Write the subset to fn without a row-name column.
write.csv(tmp,fn ,row.names = FALSE)
}
但是,此for循环大约需要3个小时才能生成这4000个文件。我可以使用for循环以外的其他方式来获得所需的结果吗?还是有更好的方法来优化此代码?我想减少生成这些文件所需的时间。
答案 0 :(得分:2)
您的循环遍历的是所有观测行(cities_files$city_name 的每一个元素),而不是去重后的城市名,因此同一个城市的文件会被重复生成很多次。您可以考虑改用 data.table 的方法。
更新
正如@MichaelChirico所指出的,应使用data.table提供的split方法(即针对data.table对象的split)。
library(data.table)

# Synthetic benchmark data: 20,000 rows spread over 200 city names
# (each name occurs 100 times).
cities_files <- data.frame(
  bar = 1:20000,
  city_name = rep(paste0("city ", 1:200), 100),
  foo = 1:20000
)

# Time two ways of writing one "<city>.csv" file per city into the working
# directory. Each expression is evaluated 5 times.
microbenchmark::microbenchmark(
  khaynes = {
    # Convert to a data.table, then filter the whole table once per distinct
    # city and write the matching rows.
    cities_files_dt <- data.table(cities_files)
    lapply(unique(cities_files_dt[, city_name]), function(city) {
      fwrite(
        x = subset(cities_files_dt, city_name == city),
        file = paste0(city, ".csv")
      )
    })
  },
  MichaelChirico = {
    # Split the table once into a list with one element per city_name value,
    # then write each element; the list names supply the file names.
    cities_files_dt <- data.table(cities_files)
    list_dt <- split(cities_files_dt, cities_files_dt$city_name)
    # seq_along() is safe for an empty list, unlike 1:length(), which would
    # yield c(1, 0) and index out of range.
    for (i in seq_along(list_dt)) {
      fwrite(list_dt[[i]], paste0(names(list_dt)[i], ".csv"))
    }
  },
  times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# khaynes 661.0689 680.6768 698.2449 683.1407 719.8056 746.5323 5
# MichaelChirico 452.0800 456.5777 499.2832 458.0174 517.4398 612.3011 5
使用data.table软件包应该可以大大缩短处理时间:
library(data.table)

# Small example table: five rows covering three distinct cities.
cities_files <- data.frame(
  bar = c(1, 1, 2, 3, 3),
  city_name = c("city a", "city a", "city b", "city c", "city c"),
  foo = c(20, 14, 40, 50, 60)
)

# Convert the data.frame to a data.table in place.
setDT(cities_files)

# For every distinct city, keep only that city's rows and write them to
# "<city>.csv" in the working directory.
lapply(unique(cities_files$city_name), function(city) {
  city_rows <- subset(cities_files, city_name == city)
  fwrite(x = city_rows, file = paste0(city, ".csv"))
})