Bivariate spatial correlation map in R (bivariate LISA)

Date: 2017-07-18 21:42:06

Tags: r geospatial spatial spdep

I would like to create a map showing the bivariate spatial correlation between two variables. This can be done either by making a LISA map of bivariate Moran's I spatial correlation or by using the L index proposed by Lee (2001).

Bivariate Moran's I is not implemented in the spdep library, but the L index is, so here is my unsuccessful attempt using the L index. Answers showing a solution based on Moran's I are also very welcome!

As you can see from the reproducible example below, so far I have managed to compute the local L indices. What I would like to do is estimate pseudo p-values and create a map of the results like the maps we use for LISA spatial clusters, with high-high, high-low, ..., low-low categories.

In this example, the goal is to create a map with the bivariate LISA association between the black and white populations. The map should be created in ggplot2 and show the clusters:

  • High presence of blacks and high presence of whites
  • High presence of blacks and low presence of whites
  • Low presence of blacks and high presence of whites
  • Low presence of blacks and low presence of whites

Reproducible example

library(UScensus2000tract)
library(ggplot2)
library(spdep)
library(sf)
library(dplyr)   # for %>% and if_else() used below

# load data
  data("oregon.tract")

# plot Census Tract map
  plot(oregon.tract)


# Variables to use in the correlation: white and black population in each census tract
  x <- scale(oregon.tract$white)
  y <- scale(oregon.tract$black)


# create  Queen contiguity matrix and Spatial weights matrix
  nb <- poly2nb(oregon.tract)
  lw <- nb2listw(nb)


# Lee index
  Lxy <- lee(x, y, lw, length(x), zero.policy=TRUE)

  # Lee’s L statistic (Global)
    Lxy[1]
    #> -0.1865688811
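
    # (Addition, not in the original post) the significance of the global L can
    # also be assessed analytically with spdep::lee.test(); shown only as a pointer:
    # lee.test(x, y, lw, zero.policy=TRUE, alternative="two.sided")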


# 10k permutations to estimate pseudo p-values
  LMCxy <- lee.mc(x, y, lw, nsim=10000, zero.policy=TRUE, alternative="less")

# quick plot of local L
  Lxy[[2]] %>% density() %>% plot() # Lee’s local L statistic  (Local)
  LMCxy[[7]] %>% density() %>% lines(col="red") # plot values simulated 10k times 


# get confidence interval of 95% ( mean +- 2 standard deviations)
  two_sd_above <- mean(LMCxy[[7]]) + 2 * sd(LMCxy[[7]])
  two_sd_below <- mean(LMCxy[[7]]) - 2 * sd(LMCxy[[7]])
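
  # (Addition, not in the original post) an alternative that avoids assuming the
  # permutation distribution is symmetric: take its empirical 2.5% / 97.5% quantiles
  ci_perm <- quantile(LMCxy[[7]], probs = c(0.025, 0.975))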


# convert spatial object to sf class for easier/faster use
  oregon_sf <- st_as_sf(oregon.tract)


# add L index values to map object
  oregon_sf$Lindex <- Lxy[[2]]

# identify significant local results  
  oregon_sf$sig <- if_else( oregon_sf$Lindex < two_sd_below, 1, if_else( oregon_sf$Lindex > two_sd_above, 1, 0))


# Map of Local L index but only the significant results
  ggplot() + geom_sf(data=oregon_sf, aes(fill=ifelse( sig==T, Lindex, NA)), color=NA)


3 Answers:

Answer 0 (score: 3)

How about this?

I used the regular Moran's I instead of the Lee index you suggested, but I believe the underlying reasoning is pretty much the same.

As you can probably see, the results produced this way look very similar to those from GeoDa.

library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)

#======================================================
# load data
data("oregon.tract")

# Variables to use in the correlation: white and black population in each census tract
x <- oregon.tract$white
y <- oregon.tract$black

#======================================================
# Programming some functions

# Bivariate Moran's I
moran_I <- function(x, y = NULL, W){
        if(is.null(y)) y = x

        xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
        yp <- (y - mean(y, na.rm=T))/sd(y, na.rm=T)
        W[which(is.na(W))] <- 0
        n <- nrow(W)

        global <- (xp%*%W%*%yp)/(n - 1)
        local  <- (xp*W%*%yp)

        list(global = global, local  = as.numeric(local))
}


# Permutations for the Bivariate Moran's I
simula_moran <- function(x, y = NULL, W, nsims = 1000){

        if(is.null(y)) y = x

        n   = nrow(W)
        IDs = 1:n

        xp <- (x - mean(x, na.rm=T))/sd(x, na.rm=T)
        W[which(is.na(W))] <- 0

        global_sims = NULL
        local_sims  = matrix(NA, nrow = n, ncol=nsims)

        ID_sample = sample(IDs, size = n*nsims, replace = T)

        y_s = y[ID_sample]
        y_s = matrix(y_s, nrow = n, ncol = nsims)
        y_s <- (y_s - apply(y_s, 1, mean))/apply(y_s, 1, sd)

        global_sims  <- as.numeric( (xp%*%W%*%y_s)/(n - 1) )
        local_sims  <- (xp*W%*%y_s)

        list(global_sims = global_sims,
             local_sims  = local_sims)
}


#======================================================
# Adjacency Matrix (Queen)

nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W  <- as(lw, "symmetricMatrix")
W  <- as.matrix(W/rowSums(W))
W[which(is.na(W))] <- 0

#======================================================
# Calculating the index and its simulated distribution
# for global and local values

m   <- moran_I(x, y, W)
m[[1]] # global value

m_i <- m[[2]]  # local values

local_sims <- simula_moran(x, y, W)$local_sims

# Identifying the significant values 
alpha <- .05  # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
intervals <- t( apply(local_sims, 1, function(x) quantile(x, probs=probs)))
sig        <- ( m_i < intervals[,1] )  | ( m_i > intervals[,2] )

#======================================================
# Preparing for plotting

oregon.tract     <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig


# Identifying the LISA patterns
xp <- (x-mean(x))/sd(x)
yp <- (y-mean(y))/sd(y)

patterns <- as.character( interaction(xp > 0, W%*%yp > 0) ) 
patterns <- patterns %>% 
        str_replace_all("TRUE","High") %>% 
        str_replace_all("FALSE","Low")
patterns[oregon.tract$sig==0] <- "Not significant"
oregon.tract$patterns <- patterns
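
# (Optional addition, not in the original answer) pin the factor level order so
# that scale_fill_manual() below keeps mapping colours to the intended categories
# even when some categories are absent from the data
oregon.tract$patterns <- factor(oregon.tract$patterns,
                                levels = c("High.High", "High.Low", "Low.High",
                                           "Low.Low", "Not significant"))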


# Plotting
ggplot() + geom_sf(data=oregon.tract, aes(fill=patterns), color="NA") +
        scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) + 
        theme_minimal()

By changing the confidence interval (e.g. using 90% instead of 95%) you can make the results closer to, though not identical with, those of GeoDa.

I believe the remaining differences come from a slightly different method of computing Moran's I. My version gives the same values as the moran function available in the spdep package, but GeoDa probably uses another approach.
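
A quick way to check that claim (my addition, not part of the original answer): in the univariate case, moran_I() should reproduce spdep::moran() as long as the weights are row-standardised and every tract has at least one neighbour (so that Szero() of the listw equals n).

# sanity check: univariate Moran's I from moran_I() vs. spdep's own function
lw_w <- nb2listw(nb, style = "W", zero.policy = TRUE)
moran(x, lw_w, n = length(x), S0 = Szero(lw_w), zero.policy = TRUE)$I
moran_I(x, x, W)$global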

(GeoDa output for comparison)

Answer 1 (score: 0)

I realize I am quite late to add to this thread, but Lee's L is quite different from what you are doing here, which is Wartenberg's (1985) innovation. That approach has some potential drawbacks. Mainly, it tests the relationship between x and the spatial lag of y, which, as @RogerioJB explained, is computed by multiplying the simulated y by the adjacency matrix. Lee's (2001) innovation is quite different: it integrates Pearson's r with a spatial smoothing scalar (SSS) and compares x with y itself rather than with the lag of y. The approach adopted by @RogerioJB can be replicated by generating a distribution of possible local L values from the lee.mc function, and the results can then be plotted in the style of GeoDa's high-high ... low-low significance cluster maps.
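
To make the distinction concrete, here is a minimal sketch (my addition; scaling constants are omitted, so the numbers will not match spdep::lee() exactly) contrasting the two local quantities, using the row-standardised matrix W built in the answer above:

# Wartenberg-style local bivariate Moran: x paired with the spatial lag of y
xp <- scale(x)[, 1]
yp <- scale(y)[, 1]
local_moran_xy <- xp * (W %*% yp)

# Lee-style local association: built from the product of the spatially
# smoothed (lagged) x and the spatially smoothed y
local_lee_core <- (W %*% xp) * (W %*% yp)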

Answer 2 (score: 0)

Building on @justin-k's suggestion, I modified the bivariate LISA code by @RogerioJB to calculate Lee's L statistic. This approach creates a modified version of the lee.mc() function from the {spdep} package in order to simulate local L values. I provide another example with point-level data in a GitHub gist.

library(boot)
library(dplyr)
library(ggplot2)
library(sf)
library(spdep)
library(rgdal)
library(stringr)
library(UScensus2000tract)

#======================================================
# load data
data("oregon.tract")

# Variables to use in the correlation: white and black population in each census tract
x <- oregon.tract$white
y <- oregon.tract$black

# ----------------------------------------------------- #
# Program a function
## Permutations for Lee's L statistic
## Modification of the lee.mc() function within the {spdep} package
## Saves 'localL' output instead of 'L' output
simula_lee <- function(x, y, listw, nsim, zero.policy = NULL, na.action = na.fail) {
  
  if (deparse(substitute(na.action)) == "na.pass") 
    stop ("na.pass not permitted")
  na.act <- attr(na.action(cbind(x, y)), "na.action")
  x[na.act] <- NA
  y[na.act] <- NA
  x <- na.action(x)
  y <- na.action(y)
  if (!is.null(na.act)) {
    subset <- !(1:length(listw$neighbours) %in% na.act)
    listw <- subset(listw, subset, zero.policy = zero.policy)
  }
  n <- length(listw$neighbours)
  if ((n != length(x)) | (n != length(y))) 
    stop ("objects of different length")
  gamres <- suppressWarnings(nsim > gamma(n + 1))
  if (gamres) 
    stop ("nsim too large for this number of observations")
  if (nsim < 1) 
    stop ("nsim too small")
  xy <- data.frame(x, y)
  S2 <- sum((unlist(lapply(listw$weights, sum)))^2)
  
  lee_boot <- function(var, i, ...) {
    return(lee(x = var[i, 1], y = var[i, 2], ...)$localL)
  }
  
  res <- boot(xy, statistic = lee_boot, R = nsim, sim = "permutation", 
              listw = listw, n = n, S2 = S2, zero.policy = zero.policy)
}

# ----------------------------------------------------- #
# Adjacency Matrix
nb <- poly2nb(oregon.tract)
lw <- nb2listw(nb, style = "B", zero.policy = T)
W  <- as(lw, "symmetricMatrix")
W  <- as.matrix(W / rowSums(W))
W[which(is.na(W))] <- 0

# ----------------------------------------------------- #
# Calculate the index and its simulated distribution
# for global and local values

# Global Lee's L
lee.test(x = x, y = y, listw = lw, zero.policy = TRUE,
         alternative = "two.sided", na.action = na.omit)

# Local Lee's L values
m <- lee(x = x, y = y, listw = lw, n = length(x), 
         zero.policy = TRUE, NAOK = TRUE)

# Local Lee's L simulations
local_sims <- simula_lee(x = x, y = y, listw = lw, nsim = 10000,
                         zero.policy = TRUE, na.action = na.omit)

m_i <- m[[2]]  # local values

# Identify the significant values 
alpha <- 0.05  # for a 95% confidence interval
probs <- c(alpha/2, 1-alpha/2)
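# 'local_sims' is a boot object: element [[2]] is its $t matrix of simulated
# statistics (nsim rows, one column per tract), hence the transpose below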
intervals <- t(apply(t(local_sims[[2]]), 1, function(x) quantile(x, probs = probs)))
sig <- (m_i < intervals[ , 1] ) | ( m_i > intervals[ , 2])

#======================================================
# Preparing for plotting
oregon.tract <- st_as_sf(oregon.tract)
oregon.tract$sig <- sig

# Identifying the Lee's L patterns
xp <- scale(x)
yp <- scale(y)

patterns <- as.character(interaction(xp > 0, W%*%yp > 0)) 
patterns <- patterns %>% 
  str_replace_all("TRUE","High") %>% 
  str_replace_all("FALSE","Low")
patterns[oregon.tract$sig == 0] <- "Not significant"
oregon.tract$patterns <- patterns

# Plotting
ggplot() +
  geom_sf(data = oregon.tract, aes(fill = patterns), color = "NA") +
  scale_fill_manual(values = c("red", "pink", "light blue", "dark blue", "grey95")) + 
  guides(fill = guide_legend(title = "Lee's L clusters")) +
  theme_minimal()

Lee's L clusters for oregon.tract data