在R中总结有困难

时间:2018-06-29 15:37:27

标签: r data.table lapply

我正在尝试按特定的列进行汇总,但是遇到了问题,即未返回不同的值。

下面提供了我如何读取数据和正在使用的代码:

<video>

我相信下面的代码是上面的重点

library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)

################
## PARAMETERS ##
################

# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"

# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ", "IA-IL", "IM-KZ", "LA-MI", "MJ-MS",
                      "MT-NV", "NW-OH", "OI-PZ", "QA-TN", "TO-UZ",
                      "VA-WA", "WB-ZZ")

# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "TEST2.csv"

# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2018-01-01"), as.Date("2018-05-31"))

# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE


##########
## CODE ##
##########

starttime <- Sys.time()
mastertable <- NULL

for (j in 1:length(in_subfolders)) {
  subfolder <- in_subfolders[j]
  sub_directory <- paste0(in_directory, subfolder, "/")

  ## IMPORT DATA
  in_filenames <- dir(sub_directory, pattern =".txt")

  for (i in 1:length(in_filenames)) {

    # Default value provided for when fast filtering is disabled.
    read_this_file <- TRUE

    # To fast filter the data, we choose to include or exclude an entire file based on the date of its first line.
    # WARNING: This is only a valid method if filtering by entire months, since that is the amount of data housed in each file.
    if (date_filter) {
      temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
                         na.strings = "", nrows = 1)
      temptable[, INVOICE_DT := as.Date(INVOICE_DT)]

      # If date matches, set read flag to TRUE.  If date does not match, set read flag to FALSE.
      read_this_file <- temptable[, INVOICE_DT] %within% date_range
    }


    if (read_this_file) {
      print(Sys.time()-starttime)
      print(paste0("Reading in ", in_filenames[i]))
      temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
                         na.strings = "")


          temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
                             .SDcols = c("Ext Sale")]

      # Combine into full list
      mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
      # Release unneeded memory
      rm(temptable)


    }

  }

}

# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
rm(mastertable)

print(Sys.time()-starttime)

运行此代码后,我希望所有销售都将由INV_ITEM_ID合并。

问题是它给了我大约500万行,只有大约900,000个不同的值。

“销售”列没有像我想要的那样汇总。

下面是我正在使用的数据的子集。

  

dput(head(datatable,30))

 temptable <- temptable[, lapply(.SD, sum), by = INV_ITEM_ID,
                         .SDcols = c("Ext Sale")]

0 个答案:

没有答案