Question

我正在处理一个大型数据集。我的目标是统计特定国家/地区的特定事件（按数据集中的编码）的总数。数据集太大，我必须用一个函数按月加载。数据来自 GDELT 数据集，可在此处获取：http://gdelt.utdallas.edu/data/backfiles/?O=D 。我已将 csv 转换为 Rdata，以便更快地读写。它是一个包含 57 个变量的数据集。

# Seed the global environment with one empty data frame per country code, so
# that later steps have an object of that name to store results in.
Countries <- c(
  "MAR", "DZA", "TUN", "LBY", "EGY", "ISR",
  "JOR", "SYR", "TUR", "GEO", "UKR", "RUS", "BLR"
)

# Bind an empty data frame to the name `CountryName` in the global environment.
loadNames <- function(CountryName) {
  assign(CountryName, data.frame(), pos = .GlobalEnv)
}

lapply(Countries, loadNames)

# Load one month of data and, for each country of interest, count the events
# in each QuadClass (1-4) in which that country is both actor 1 and actor 2.
#
# MonthName: base name (without ".RData") of the file to load.
#
# Nothing useful is returned; the results are written to the global
# environment with assign(), one object per country code.
#
# Relies on objects that are not created in this function: `total`,
# `Header.57`, the per-country objects MAR, DZA, ..., and `Month`
# (presumably created by the load() call -- confirm against the .RData files).
loadMonth <- function(MonthName) {
  # NOTE(review): `total` is not defined in this function, and `pb` is never
  # referenced again after this line.
  pb <- txtProgressBar(min = 0, max = total, initial = 0, char = "=", style = 1, width = 10)

  # Load the month into this function's own environment.
  load(paste("/Users/mennoschellekens/Dropbox/HCSS-workinprogress/GDELT/Rdata/",MonthName,".RData", sep = ""), envir=environment())
  colnames(Month) <- names(Header.57)

  # Keep only the rows where both actors are among the 13 countries of
  # interest, so the per-country filtering below works on a smaller table.
  y <- subset(Month, ((Actor1CountryCode == "SYR" | Actor1CountryCode =="MAR" | Actor1CountryCode =="DZA" | Actor1CountryCode == "TUN" | Actor1CountryCode == "LBY" | Actor1CountryCode == "EGY" | Actor1CountryCode == "ISR" | Actor1CountryCode == "JOR" | Actor1CountryCode == "TUR" | Actor1CountryCode == "GEO" | Actor1CountryCode == "UKR" | Actor1CountryCode == "RUS" | Actor1CountryCode == "BLR") & (Actor2CountryCode == "SYR" | Actor2CountryCode == "MAR" | Actor2CountryCode == "DZA" | Actor2CountryCode == "TUN" | Actor2CountryCode == "LBY" | Actor2CountryCode == "EGY" | Actor2CountryCode == "ISR" | Actor2CountryCode == "JOR" | Actor2CountryCode == "TUR" | Actor2CountryCode == "GEO" | Actor2CountryCode == "UKR" | Actor2CountryCode == "RUS" | Actor2CountryCode == "BLR")))

  # QuadClass values to count.
  QuadCat <- c(1,2,3,4)

  # Country codes to process, and the matching per-country objects from the
  # global environment.
  CountryString <- c("MAR","DZA", "TUN","LBY","EGY","ISR",
                     "JOR","SYR","TUR","GEO","UKR","RUS","BLR")
  # NOTE(review): c() applied to data frames concatenates their columns into
  # one flat list; it does not build a 13-element list holding one data frame
  # per country, so mapply() below does not receive one data frame per code.
  CountryData <- c(MAR,DZA,TUN,LBY,EGY,ISR,JOR,SYR,TUR,GEO,UKR,RUS,BLR)

  # For one country: count the events of each QuadClass and store the result
  # under the country's code in the global environment.
  CheckEvents <- function(CountryData,CountryString) {
    # Rows where this country is both actor 1 and actor 2.
    x <- subset(y, ((Actor1CountryCode == CountryString) & (Actor2CountryCode == CountryString)))

    # This is the problem:
    for (Y in QuadCat) {
      # NOTE(review): `e` is not created anywhere in this function before it
      # is indexed here; from the second pass on it is whatever rbind()
      # returned on the previous pass, not a plain vector of counts.
      e[[Y]] <- (sum(x$QuadClass == Y))
      e <- rbind(CountryData,c(e))
      # Runs on every pass, so the global object is overwritten once per
      # QuadClass value.
      assign(CountryString, as.data.frame(e), pos = .GlobalEnv)
    }
  }
  mapply(CheckEvents, CountryData = CountryData, CountryString = CountryString)    
} ###### END

第一次运行的输出是一个包含四个数字的向量，这没有问题，它随后被存储在全局环境中。但是，当我尝试用 rbind()、cbind() 或 merge() 将该结果与新结果绑定时，得到的结果非常奇怪。最值得注意的是，它不把 CountryData 当作向量读取，而只取向量中的最后一个值。我不明白它在做什么，也不明白为什么无法绑定。

Answer 1

我认为以下内容可以满足您的需求，之后您可以根据需要对结果进行重塑。不过，您的示例代码非常混乱，并且包含一些有潜在危险的做法，例如在函数内部使用 assign 和 subset。您似乎还有很多多余的代码。我认为这些都是造成问题的原因。

# Country codes to tally; used to filter both actor columns.
Countries <- c(
  "MAR", "DZA", "TUN", "LBY", "EGY", "ISR",
  "JOR", "SYR", "TUR", "GEO", "UKR", "RUS", "BLR"
)

# Count events per QuadClass for each country in one month of data.
#
# Loads "<data_dir>/<MonthName>.RData", which must define a data frame named
# `Month`, renames its columns from `names(Header.57)`, keeps the rows where
# both actor country codes are in `countries`, and then, for each country,
# counts the rows where that country is both actor 1 and actor 2, broken down
# by QuadClass.
#
# Arguments:
#   MonthName     Base name of the .RData file, without the extension.
#   countries     Character vector of country codes to tally. Defaults to the
#                 top-level `Countries` vector.
#   quad_classes  QuadClass values to count. Defaults to 1:4.
#   data_dir      Directory that holds the monthly .RData files.
#
# Returns a list named by country code. Each element is an integer vector
# with one count per entry of `quad_classes`, in the same order.
#
# Depends on `Header.57`, which is not defined in this function and must be
# available in an enclosing environment.
loadMonth <- function(MonthName,
                      countries = Countries,
                      quad_classes = 1:4,
                      data_dir = "/Users/mennoschellekens/Dropbox/HCSS-workinprogress/GDELT/Rdata") {
  # Load into this call's environment so nothing leaks into the global one.
  env <- environment()
  load(file.path(data_dir, paste0(MonthName, ".RData")), envir = env)
  if (!exists("Month", envir = env, inherits = FALSE)) {
    stop(
      "'", MonthName, ".RData' does not contain an object named 'Month'",
      call. = FALSE
    )
  }
  colnames(Month) <- names(Header.57)

  # %in% never yields NA, so rows with a missing actor code are dropped here.
  in_scope <- Month$Actor1CountryCode %in% countries &
    Month$Actor2CountryCode %in% countries
  y <- Month[in_scope, , drop = FALSE]

  # Counts for a single country, one value per entry of `quad_classes`.
  CheckEvents <- function(CountryString) {
    same_country <- y$Actor1CountryCode == CountryString &
      y$Actor2CountryCode == CountryString
    quad <- y$QuadClass[same_country]
    # na.rm = TRUE: a missing QuadClass must not turn a whole count into NA.
    # vapply() guarantees an integer vector even when `quad` is empty.
    vapply(
      quad_classes,
      function(q) sum(quad == q, na.rm = TRUE),
      integer(1)
    )
  }

  # One vector of counts per country, named by country code.
  out <- lapply(countries, CheckEvents)
  names(out) <- countries
  out
}
loadMonth("January") # get the list for one month; not sure how you have months named

R - 将循环中的向量绑定到一个data.frame中

1 个答案: