我有一个数据集,其中每一行代表某个国家在某一特定年份的观测值。每行还有两个变量(Neighbor1 和 Neighbor2),列出了该国地理邻国的国家代码。
示例数据集代码:
# Example panel: five countries observed in 2000 and 2001.
# The country-level vectors repeat once per year, so they are built with rep().
A <- rep(c("US", "Cuba", "France", "Germany", "Belgium"), times = 2)
B <- rep(c(2000, 2001), each = 5)
C <- rep(c(2, 40, 220, 255, 211), times = 2)
# Terrorism deaths differ per country-year, so they are spelled out in full.
D <- c(5, 10, 11, 3, 8, 1, 7, 15, 6, 9)
# Country codes of the first and second neighbor (NA = no second neighbor).
E <- rep(c(40, 2, 211, 211, 220), times = 2)
G <- rep(c(NA, NA, 255, 220, 255), times = 2)
Example <- setNames(
  data.frame(A, B, C, D, E, G),
  c("Country", "Year", "CountryCode", "TerrorismDeaths", "Neighbor1", "Neighbor2")
)
数据集:
Country Year CountryCode TerrorismDeaths Neighbor1 Neighbor2
1 US 2000 2 5 40 NA
2 Cuba 2000 40 10 2 NA
3 France 2000 220 11 211 255
4 Germany 2000 255 3 211 220
5 Belgium 2000 211 8 220 255
6 US 2001 2 1 40 NA
7 Cuba 2001 40 7 2 NA
8 France 2001 220 15 211 255
9 Germany 2001 255 6 211 220
10 Belgium 2001 211 9 220 255
我想做的是创建一个变量,用于衡量每个国家在特定年份其邻国的平均恐怖主义死亡人数。因此,我想添加如下所示的一列:
所需的输出:
Country Year CountryCode TerrorismDeaths Neighbor1 Neighbor2 NeighborAvgTerror
1 US 2000 2 5 40 NA 10.0
2 Cuba 2000 40 10 2 NA 5.0
3 France 2000 220 11 211 255 5.5
4 Germany 2000 255 3 211 220 9.5
5 Belgium 2000 211 8 220 255 7.0
6 US 2001 2 1 40 NA 7.0
7 Cuba 2001 40 7 2 NA 1.0
8 France 2001 220 15 211 255 7.5
9 Germany 2001 255 6 211 220 12.0
10 Belgium 2001 211 9 220 255 10.5
答案 0 :(得分:1)
使用 data.table 可以通过三个步骤完成:
library(data.table)

# Convert in place so the result column can be added by reference.
setDT(Example)

# Step 1: reshape to long form, one row per (Year, CountryCode, Neighbor).
# The pattern only matches Neighbor<N> columns, so a previously added
# NeighborAvgTerror column is not melted when the script is re-run.
# Rows with a missing neighbor are dropped.
long <- melt(Example, id.vars = c("Year", "CountryCode"),
             measure.vars = patterns("^Neighbor[0-9]+$"),
             value.name = "Neighbor", na.rm = TRUE)

# Step 2: look up each listed neighbor's deaths for the same year, then
# average per country-year. Joining on the neighbor's own code (rather than
# on "who lists me as a neighbor") keeps the result correct even when the
# neighbor lists are not symmetric.
long[Example, on = .(Year, Neighbor = CountryCode),
     NeighborDeaths := i.TerrorismDeaths]
agg <- long[, .(NeighborAvgTerror = mean(NeighborDeaths, na.rm = TRUE)),
            by = .(Year, CountryCode)]

# Step 3: update join, attaching the average to the original table by reference.
# Countries without any listed neighbor get NA.
Example[agg, on = .(Year, CountryCode),
        NeighborAvgTerror := i.NeighborAvgTerror]
Example[]
Country Year CountryCode TerrorismDeaths Neighbor1 Neighbor2 NeighborAvgTerror
1: US 2000 2 5 40 NA 10.0
2: Cuba 2000 40 10 2 NA 5.0
3: France 2000 220 11 211 255 5.5
4: Germany 2000 255 3 211 220 9.5
5: Belgium 2000 211 8 220 255 7.0
6: US 2001 2 1 40 NA 7.0
7: Cuba 2001 40 7 2 NA 1.0
8: France 2001 220 15 211 255 7.5
9: Germany 2001 255 6 211 220 12.0
10: Belgium 2001 211 9 220 255 10.5
答案 1 :(得分:0)
也许有更好的方法,但这可行:
## Every column named Neighbor<N> is treated as a neighbor code, so the code
## works unchanged for any number of neighbor columns.
neighbor_cols <- grep("^Neighbor[0-9]+$", names(Example), value = TRUE)

## For every row (country-year), average the terrorism deaths of its neighbors
## in the same year. seq_len() keeps this safe for an empty data frame, and the
## row count comes from Example itself rather than from a stray global vector.
Example$NeighborAvgTerror <- vapply(
  seq_len(nrow(Example)),
  function(i) {
    ## Neighbor codes of this row, ignoring missing ones
    codes <- unlist(Example[i, neighbor_cols], use.names = FALSE)
    codes <- codes[!is.na(codes)]
    ## Rows of the neighboring countries in the same year
    is_neighbor <- Example$Year == Example$Year[i] &
      Example$CountryCode %in% codes
    ## Average while ignoring missing values; NaN when no neighbor is found
    mean(Example$TerrorismDeaths[is_neighbor], na.rm = TRUE)
  },
  numeric(1)
)
我从您的评论中看到,实际数据中每个国家有两个以上的邻国,您可以相应地修改代码。
答案 2 :(得分:0)
library(dplyr)
library(tidyr)

# Lookup table of deaths per country-year, used to fetch each neighbor's deaths.
deaths <- select(Example, CountryCode, Year, TerrorismDeaths)
# Original neighbor columns, set aside so they can be re-attached at the end.
neighbors <- select(Example, CountryCode, Year, matches('Neighbor[0-9]+'))

Example %>%
  # Long form: one row per (country, year, neighbor); missing neighbors dropped.
  gather(key = Neighbor_num, value = Neighbor,
         matches('Neighbor[0-9]+'), na.rm = TRUE) %>%
  # Attach each neighbor's deaths for the same year as TerrorismDeaths_neighbor.
  left_join(deaths,
            by = c(Neighbor = "CountryCode", Year = "Year"),
            suffix = c("", "_neighbor")) %>%
  # Collapse back to one row per country-year with the neighbors' mean deaths.
  group_by(Country, Year, CountryCode, TerrorismDeaths) %>%
  summarise(NeighborAvgTerror = mean(TerrorismDeaths_neighbor, na.rm = TRUE)) %>%
  arrange(Year, CountryCode) %>%
  # Restore the wide Neighbor<N> columns.
  left_join(neighbors, by = c("CountryCode", "Year"))
# # A tibble: 10 x 7
# # Groups: Country, Year, CountryCode [?]
# Country Year CountryCode TerrorismDeaths NeighborAvgTerror Neighbor1 Neighbor2
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US 2000 2 5 10 40 NA
# 2 Cuba 2000 40 10 5 2 NA
# 3 Belgium 2000 211 8 7 220 255
# 4 France 2000 220 11 5.5 211 255
# 5 Germany 2000 255 3 9.5 211 220
# 6 US 2001 2 1 7 40 NA
# 7 Cuba 2001 40 7 1 2 NA
# 8 Belgium 2001 211 9 10.5 220 255
# 9 France 2001 220 15 7.5 211 255
# 10 Germany 2001 255 6 12 211 220
如果您需要首先建立一个邻近国家/邻国的数据集,则可以使用CoW Direct Contiguity dataset。
library(curl)
library(readr)
library(dplyr)
library(tidyr)
# Download the Correlates of War Direct Contiguity archive into a temporary
# file and extract only the dyadic contiguity CSV from it.
cow_url <- 'http://www.correlatesofwar.org/data-sets/direct-contiguity/direct-contiguity-v3-2'
zip_path <- tempfile()
curl_download(cow_url, zip_path)
csv_file <- unzip(zip_path, files = 'DirectContiguity320/contdird.csv', exdir = tempdir())
unlink(zip_path)

# Contiguity type (conttype) codes:
#   1: separated by a land or river border
#   2: separated by 12 miles of water or less
#   3: separated by 24 miles of water or less (but more than 12 miles)
#   4: separated by 150 miles of water or less (but more than 24 miles)
#   5: separated by 400 miles of water or less (but more than 150 miles)
neighbors <-
  read_csv(csv_file, col_types = 'iiciciid') %>%
  # Keep only the years needed and the chosen level of separation.
  filter(year >= 2000, conttype <= 4) %>%
  select(CountryCode = state1no, Year = year, neighbor = state2no) %>%
  # Number the neighbors within each country-year, then spread them into
  # wide columns Neighbor1, Neighbor2, ...
  group_by(CountryCode, Year) %>%
  mutate(Neighbor = row_number()) %>%
  spread(key = Neighbor, value = neighbor, sep = '')

# The extracted CSV is no longer needed once it has been read.
unlink(csv_file)
# Lookup table of deaths per country-year, used to fetch each neighbor's deaths.
deaths <- select(Example, CountryCode, Year, TerrorismDeaths)

Example %>%
  # Replace the hand-entered neighbor columns with the downloaded ones.
  select(-c(Neighbor1, Neighbor2)) %>%
  left_join(neighbors, by = c("Year", "CountryCode")) %>%
  # Long form: one row per (country, year, neighbor); missing neighbors dropped.
  gather(key = Neighbor_num, value = Neighbor,
         matches('Neighbor[0-9]+'), na.rm = TRUE) %>%
  # Attach each neighbor's deaths for the same year as TerrorismDeaths_neighbor.
  left_join(deaths,
            by = c(Neighbor = "CountryCode", Year = "Year"),
            suffix = c("", "_neighbor")) %>%
  # Collapse back to one row per country-year with the neighbors' mean deaths.
  group_by(Country, Year, CountryCode, TerrorismDeaths) %>%
  summarise(NeighborAvgTerror = mean(TerrorismDeaths_neighbor, na.rm = TRUE)) %>%
  arrange(Year, Country) %>%
  # Restore the wide Neighbor<N> columns.
  left_join(neighbors, by = c("CountryCode", "Year"))