Question

我有一个数据集，其中每一行代表某个国家在某一特定年份的观测。每行还有两个变量（Neighbor1 和 Neighbor2），分别列出该国地理邻国的国家代码。

示例数据集代码：

# Example panel: five countries observed in 2000 and again in 2001.
# The helper vectors A, B, C, D, E and G stay in the workspace because
# later snippets on this page refer to them.
A <- rep(c("US", "Cuba", "France", "Germany", "Belgium"), times = 2)
B <- rep(c(2000, 2001), each = 5)
C <- rep(c(2, 40, 220, 255, 211), times = 2)
D <- c(5, 10, 11, 3, 8, 1, 7, 15, 6, 9)
E <- rep(c(40, 2, 211, 211, 220), times = 2)
G <- rep(c(NA, NA, 255, 220, 255), times = 2)

# Assemble the data frame and give the columns their descriptive names.
Example <- setNames(
  data.frame(A, B, C, D, E, G),
  c("Country", "Year", "CountryCode", "TerrorismDeaths",
    "Neighbor1", "Neighbor2")
)

数据集：

   Country Year CountryCode TerrorismDeaths Neighbor1 Neighbor2
1       US 2000           2               5        40        NA
2     Cuba 2000          40              10         2        NA
3   France 2000         220              11       211       255
4  Germany 2000         255               3       211       220
5  Belgium 2000         211               8       220       255
6       US 2001           2               1        40        NA
7     Cuba 2001          40               7         2        NA
8   France 2001         220              15       211       255
9  Germany 2001         255               6       211       220
10 Belgium 2001         211               9       220       255

我想创建一个变量，用于衡量每个国家在该特定年份其邻国的平均恐怖主义死亡人数。因此，我想添加如下所示的一列：

所需的输出：

   Country Year CountryCode TerrorismDeaths Neighbor1 Neighbor2 NeighborAvgTerror
1       US 2000           2               5        40        NA              10.0
2     Cuba 2000          40              10         2        NA               5.0
3   France 2000         220              11       211       255               5.5
4  Germany 2000         255               3       211       220               9.5
5  Belgium 2000         211               8       220       255               7.0
6       US 2001           2               1        40        NA               7.0
7     Cuba 2001          40               7         2        NA               1.0
8   France 2001         220              15       211       255               7.5
9  Germany 2001         255               6       211       220              12.0
10 Belgium 2001         211               9       220       255              10.5

Answer 1

这可以通过三个步骤完成：

1. 将数据集从宽格式转换为长格式
2. 在自连接中进行聚合
3. 使用更新连接（update join）将结果列追加到原始数据集

使用 data.table：

library(data.table)
# Step 1 - reshape wide -> long: one row per (country, year, neighbor) pair.
# setDT() converts Example to a data.table by reference; na.rm = TRUE drops
# the rows for empty (NA) neighbor slots.
long <- melt(setDT(Example), measure.vars = patterns("^Neighbor"), 
             value.name = "Neighbor", na.rm = TRUE)
# Step 2 - self-join `long` with itself and aggregate once per row of the
# inner `long` (by = .EACHI): rows of the outer `long` whose Neighbor equals
# the inner row's CountryCode, in the same Year, are matched and their
# TerrorismDeaths averaged. The unnamed result column is called V1.
# NOTE(review): this appears to average over the countries that list the
# current country as a neighbor, so it matches the desired output only when
# the neighbor lists are symmetric - confirm for real data.
agg <- long[long, on = .(Year, Neighbor = CountryCode), 
            mean(TerrorismDeaths), by = .EACHI]
# Step 3 - update join: write the averages back into Example by reference,
# matching Example's (Year, CountryCode) to agg's (Year, Neighbor).
Example[agg, on = .(Year, CountryCode = Neighbor), NeighborAvgTerror := V1]

# Print the updated table.
Example[]

    Country Year CountryCode TerrorismDeaths Neighbor1 Neighbor2 NeighborAvgTerror
 1:      US 2000           2               5        40        NA              10.0
 2:    Cuba 2000          40              10         2        NA               5.0
 3:  France 2000         220              11       211       255               5.5
 4: Germany 2000         255               3       211       220               9.5
 5: Belgium 2000         211               8       220       255               7.0
 6:      US 2001           2               1        40        NA               7.0
 7:    Cuba 2001          40               7         2        NA               1.0
 8:  France 2001         220              15       211       255               7.5
 9: Germany 2001         255               6       211       220              12.0
10: Belgium 2001         211               9       220       255              10.5

Answer 2

也许有更好的方法，但这可行：

## Columns that hold neighbor country codes (Neighbor1, Neighbor2, ...).
## Looking them up by name lets the loop handle any number of neighbors.
neighbor_cols <- grep("^Neighbor[0-9]+$", names(Example), value = TRUE)

## Create a one-column dataframe with one row per row of Example.
## It is sized from Example itself rather than from the helper vector A,
## so the code still works when only the data frame is available.
NeighborAvgTerror <- data.frame(NeighborAvgTerror = double(nrow(Example)))

## Go through every country-year row.
## seq_len() yields an empty sequence (no iterations) for a 0-row Example.
for (i in seq_len(nrow(Example))) {
  ## Get this row's neighbor codes, skipping empty (NA) slots
  codes <- unlist(lapply(neighbor_cols, function(col) Example[[col]][i]))
  codes <- codes[!is.na(codes)]

  ## Find the neighbors' rows for the same year
  same_year <- Example$Year == Example$Year[i]
  hits <- which(same_year & Example$CountryCode %in% codes)

  ## Average while ignoring missing values.
  ## The result is NaN when no neighbor row was found.
  NeighborAvgTerror$NeighborAvgTerror[i] <-
    mean(Example$TerrorismDeaths[hits], na.rm = TRUE)
}

## Append to Example dataframe
Example <- cbind(Example, NeighborAvgTerror)

我从您的评论中看到，您的实际数据中邻国不止两个；您可以相应地修改代码。

Answer 3

使用tidyverse包dplyr和tidyr

library(dplyr)
library(tidyr)

# Lookup tables: deaths per country-year, and each country-year's neighbor
# columns (re-attached at the end, after the reshape has consumed them).
deaths <- Example %>% select(CountryCode, Year, TerrorismDeaths)
neighbors <- Example %>% select(CountryCode, Year, matches('Neighbor[0-9]+'))

Example %>% 
  # Wide -> long: one row per (country, year, neighbor). Empty neighbor
  # slots are dropped, so a country-year with no neighbors at all does not
  # appear in the result.
  pivot_longer(matches('Neighbor[0-9]+'), names_to = 'Neighbor_num',
               values_to = 'Neighbor', values_drop_na = TRUE) %>% 
  # Attach each neighbor's deaths for the same year as
  # TerrorismDeaths_neighbor.
  left_join(deaths, by = c(Neighbor = 'CountryCode', 'Year'),
            suffix = c('', '_neighbor')) %>% 
  # Collapse back to one row per country-year, averaging over its neighbors.
  group_by(Country, Year, CountryCode, TerrorismDeaths) %>% 
  summarise(NeighborAvgTerror = mean(TerrorismDeaths_neighbor, na.rm = TRUE)) %>% 
  arrange(Year, CountryCode) %>% 
  # Restore the original Neighbor columns.
  left_join(neighbors, by = c('CountryCode', 'Year'))

# # A tibble: 10 x 7
# # Groups:   Country, Year, CountryCode [?]
#    Country  Year CountryCode TerrorismDeaths NeighborAvgTerror Neighbor1 Neighbor2
#    <fct>   <dbl>       <dbl>           <dbl>             <dbl>     <dbl>     <dbl>
#  1 US       2000           2               5              10          40        NA
#  2 Cuba     2000          40              10               5           2        NA
#  3 Belgium  2000         211               8               7         220       255
#  4 France   2000         220              11               5.5       211       255
#  5 Germany  2000         255               3               9.5       211       220
#  6 US       2001           2               1               7          40        NA
#  7 Cuba     2001          40               7               1           2        NA
#  8 Belgium  2001         211               9              10.5       220       255
#  9 France   2001         220              15               7.5       211       255
# 10 Germany  2001         255               6              12         211       220

如果您需要首先建立一个邻近国家/邻国的数据集，则可以使用CoW Direct Contiguity dataset。

library(curl)
library(readr)
library(dplyr)
library(tidyr)

# Download the Correlates of War direct-contiguity archive to a temporary
# file.
url <- 'http://www.correlatesofwar.org/data-sets/direct-contiguity/direct-contiguity-v3-2'
tmp <- tempfile()
curl_download(url, tmp)
# Extract only the contdird.csv member into the session temp directory;
# csv_file receives the value returned by unzip() and is read further below.
csv_file <- unzip(tmp, files = 'DirectContiguity320/contdird.csv', exdir = tempdir())
# The downloaded archive is not needed once the CSV has been extracted.
unlink(tmp)

# level of separation options (compared against `conttype` below)
# 1: Separated by a land or river border
# 2: Separated by 12 miles of water or less
# 3: Separated by 24 miles of water or less (but more than 12 miles)
# 4: Separated by 150 miles of water or less (but more than 24 miles)
# 5: Separated by 400 miles of water or less (but more than 150 miles)

# Build one row per country-year with its neighbors spread across columns
# Neighbor1, Neighbor2, ...
neighbors <- 
  read_csv(csv_file, col_types = 'iiciciid') %>% 
  filter(year >= 2000) %>%  # filter to only years you need
  filter(conttype <= 4) %>%  # choose level of separation
  select(CountryCode = state1no, Year = year, neighbor = state2no) %>% 
  # Number each country-year's neighbors 1..k; the number becomes the
  # suffix of the column name.
  group_by(CountryCode, Year) %>% 
  mutate(Neighbor = row_number()) %>% 
  # The grouping was only needed for the numbering. The result is therefore
  # an ungrouped tibble.
  ungroup() %>% 
  pivot_wider(names_from = Neighbor, values_from = neighbor,
              names_prefix = 'Neighbor')

# Remove the extracted CSV now that it has been read.
unlink(csv_file)

# Lookup table: deaths per country-year.
deaths <- Example %>% select(CountryCode, Year, TerrorismDeaths)

Example %>% 
  # Replace the hand-entered neighbor columns with the ones from `neighbors`.
  select(-Neighbor1, -Neighbor2) %>% 
  left_join(neighbors, by = c("Year", "CountryCode")) %>% 
  # Wide -> long: one row per (country, year, neighbor). Empty neighbor
  # slots are dropped, so a country-year with no neighbors at all does not
  # appear in the result.
  pivot_longer(matches('Neighbor[0-9]+'), names_to = 'Neighbor_num',
               values_to = 'Neighbor', values_drop_na = TRUE) %>% 
  # Attach each neighbor's deaths for the same year as
  # TerrorismDeaths_neighbor.
  left_join(deaths, by = c(Neighbor = 'CountryCode', 'Year'),
            suffix = c('', '_neighbor')) %>% 
  # Collapse back to one row per country-year, averaging over its neighbors.
  group_by(Country, Year, CountryCode, TerrorismDeaths) %>% 
  summarise(NeighborAvgTerror = mean(TerrorismDeaths_neighbor, na.rm = TRUE)) %>%
  arrange(Year, Country) %>%
  # Re-attach the neighbor columns.
  left_join(neighbors, by = c('CountryCode', 'Year'))

使用R中的其他行创建平均变量

3 个答案: