重新创建选举目标计划

时间:2018-08-02 20:34:11

标签: r plyr pscl

我真的很喜欢政治和选举,只是想学习R,我想用当地县的新数据重新创建blog中提出的流程。我可以使用修改后的代码可靠地完成博客中的大部分流程,直到进行区域分析为止。

datas <- district.analyze(data)

作者分析了特定的房屋区域,而我更希望分析整个县。我修改了代码,以将美国众议院作为我的目标地区,因为它涵盖了整个县。

我想知道是否有人对为什么我无法从该县数据中获取区域级别摘要提出建议。我收到这样的错误:

> Error in aggregate.data.frame(as.data.frame(x), ...) : 
no rows to aggregate
In addition: Warning message:
In min(adf[, "rep_turnout_pct"], na.rm = TRUE) :

只有在数据中包含“ NA”时,才会出现此错误。当我用“ 0”代替空格时,district.analyze起作用了,但是“ 0”使所有等式失效。

我可以重现的最小代码量是:

library(plyr)

 major.party.bias <- function(adf) {    

 # aggregate base partisan vote -  lowest non-zero turnout by party, given any election

 abpv_rep <- min(adf[adf$rep_turnout_pct,"rep_turnout_pct"],na.rm=TRUE)
abpv_dem <- min(adf[adf$dem_turnout_pct,"dem_turnout_pct"],na.rm=TRUE)

 # aggregate base partisan is combination of major parties worst scores
 base_abpv = abpv_rep + abpv_dem
 # swing is what is left after the aggregate base partisan support is removed
 abpv_swing = 1.0 - base_abpv

 # remove elections w/ no contender ie NA rep or NA dem turnout
 tsa <- adf[which(!is.na(adf$dem_turnout) & !is.na(adf$rep_turnout)),]  
 # add a abs difference of rep v dem column
 tsa[,"spread"] <- abs(tsa$dem_turnout_pct - tsa$rep_turnout_pct)

 # average party performance - average of the top 3 best matched races (sorted by abs(rep-dem) performance)
 app_dem <- mean(tsa[order(tsa$spread)[1:3],]$dem_turnout_pct)
 app_rep <- mean(tsa[order(tsa$spread)[1:3],]$rep_turnout_pct)

 # aggreage soft partisan vote - difference between the average worst over each year and the absolute worst (aggregate base partisan vote)
 tsa <- adf[which(!is.na(adf$rep_turnout)),]
 abpv_rep_soft <- mean(aggregate(tsa$rep_turnout_pct,tsa["year"],min)[,"x"]) - abpv_rep
 tsa <- adf[which(!is.na(adf$dem_turnout)),]
 abpv_dem_soft <- mean(aggregate(tsa$dem_turnout_pct,tsa["year"],min)[,"x"]) - abpv_dem

 # tossup is everything left after we take out base and soft support for both major parties
 abpv_tossup = abs(1.0 - abpv_rep_soft - abpv_rep - abpv_dem_soft - abpv_dem)

 partisan.rep <- abpv_rep + abpv_rep_soft
 partisan.dem <- abpv_dem + abpv_dem_soft

 return (data.frame(partisan.base=base_abpv,partisan.swing=abpv_swing,tossup=abpv_tossup,
                    app.rep=app_rep,base.rep=abpv_rep,soft.rep=abpv_rep_soft,app.dem=app_dem,base.dem=abpv_dem,soft.dem=abpv_dem_soft,
                    partisan.rep=partisan.rep, partisan.dem=partisan.dem)) 
}


 project.turnout <- function(adf,years=c(2012,2014,2016),target.district.type="U.S. House",similar.district.types=c('U.S. Senate','State Senate', 'State Auditor', 'Governor'),top.ballot.district.type="U.S. Senate") {
 # look for good elections in years
 case.type = 0
 gl <- adf[which(adf$year %in% years & adf$district_type == target.district.type & !is.na(adf$dem_turnout) & !is.na(adf$rep_turnout)),] 
 # case 1 - major parties ran in 2001,2005 (governor + lt governor + HD)
 # we'll calculate the average_turnout x downballot_turnout
 proj.turnout <- 0.0
 if(nrow(gl) >= 2 ){
     down.ballot.turnout <- mean((gl$dem_turnout + gl$rep_turnout) / gl$total_registration)     
     gl <- adf[which(adf$year %in% years & adf$district_type == top.ballot.district.type),]             
     top.ticket.turnout <- mean(gl$total_turnout / gl$total_registration)
     gl <- adf[which(adf$year %in% years & !is.na(adf$dem_turnout) & !is.na(adf$rep_turnout)),]             
     avg.turnout <- mean((gl$dem_turnout + gl$rep_turnout) / gl$total_registration) 
     runoff <- down.ballot.turnout / top.ticket.turnout
     proj.turnout <- runoff * avg.turnout
     case.type = 1
 }  
 # case 2 - missing major party candidate in ''years'', so we 'll just take the average of what we've got walking backwards from the last known good year
 # need more than one HD election
 else {     
     gl <- adf[which(adf$district_type == target.district.type & !is.na(adf$dem_turnout) & !is.na(adf$rep_turnout)),]   
     if(nrow(gl) >= 1 ) {
         # calculate the average turnout of at least one election
        proj.turnout <- mean((gl$dem_turnout + gl$rep_turnout) / gl$total_registration)                     
         case.type = 2
     }
     else {
         # we dont have any evenly matched house races so we'll look at ''similar.district.types'' as a substitute
         gl <- adf[which((adf$district_type %in% similar.district.types) & !is.na(adf$dem_turnout) & !is.na(adf$rep_turnout)),] 
         if(nrow(gl) >= 1) {
             proj.turnout <- mean((gl$dem_turnout + gl$rep_turnout) / gl$total_registration)                                        
             case.type = 3
         }
         else {
             proj.turnout <- 0          
             case.type = 4
         }
     }
 }
 # project the actual registration based on the known last registration in the df
 reg <- adf[1,]$last_registration
 proj.turnout.count <- proj.turnout  * reg return(data.frame(proj.turnout.percent=proj.turnout,proj.turnout.count=proj.turnout.count,current.reg=reg,case.type=case.type))
}

  # apply the major party bias to the projected turnout 
apply.turnout <- function(adf) {
   # take proj.turnout.count (from project.turnout) and combine it 
 with partisan percentages from major.party.bias


  adf$proj.turnout.dem <- floor(adf$proj.turnout.count * adf$app.dem)
  adf$proj.turnout.rep <- floor(adf$proj.turnout.count * adf$app.rep)
  adf$votes.to.win <- floor(adf$proj.turnout.count/2)+1

 return(adf)
 }


 district.analyze <- function(dis) {
ret <- ddply(dis, .(precinct_name), function(x) merge(project.turnout(x),major.party.bias(x)))
ret <- apply.turnout(ret)
return(ret)
 }

我的数据是我从.csv读入R的大型数据集:

## Data given as Google Sheets
library(gsheet)
url <-"https://drive.google.com/file/d/1E4P0rfDVWEepbGHwX58qNSWN5vWd3iQU/view?usp=sharing"
df <- gsheet2tbl(url)

1 个答案:

答案 0 :(得分:0)

由于URL容易腐烂,所以最好像@Steady所指出的那样,做一个最小的可复制示例。不过,我将与您所给我们的合作。

我首先从GitHub导入您的代码:

## Matt's code from GitHub
library(RCurl)
script <-
  getURL(
    "https://raw.githubusercontent.com/mwtxw2/R-Aggpol-Boone-Test/master/R%20Data",
    ssl.verifypeer = FALSE
  )
eval(parse(text = script))

然后我读了您在Google表格中提供的数据

## Data given as Google Sheets
library(gsheet)
url <-
  "https://docs.google.com/spreadsheets/d/1HJjLDFEiLixQZLeMtliXyojsPtWh6XM9AZjJbc-96tA/edit?usp=sharing"
df <- gsheet2tbl(url)

现在,我认为问题是,如果美国参议院子集被称为-则没有区号-它们是NA值。

## There are no district numbers if subsetted to U.S. Senate.
table((df %>% dplyr::filter(district_type == "U.S. Senate"))$district_number)
sum(!is.na((df %>% dplyr::filter(district_type == "U.S. Senate"))$district_number))

## Summary function edited
historical.turnout.summary <- function(adf,
                                       district.type = "U.S. Senate",
                                       district.number = NULL,
                                       years = c(2012, 2014, 2016)) {
    s <-
      adf[which(
        adf$district_type == district.type &
          # adf$district_number == district.number &
          adf$year %in% years
      ), ]
    if (!is.null(district.number)) {
      s <- 
        s[which(s$district_number == district.number)]
    }
    df <- ddply(
      s, "year",
      function(x) {
        year <- x$year[1]
        total.turnout <- sum(x$total_turnout)
        total.registration <- sum(x$total_registration)
        return(
          data.frame(
            year = year,
            total.turnout = total.turnout,
            total.registration = total.registration
          )
        )
      }
    )
    return(df)
  }

## Re-run
historical.turnout.summary(
  df, 
  district.type = "U.S. Senate", 
  years = c(2012, 2014, 2016) 
)

但是我不确定100%,因为您可能已经从本地文件中读取了数据,并且虽然从Google表格上载/下载了数据,但空白可能已变成NA。

使用district.analyze时,调用major.party.bias时会出错,因为您尝试对诸如"21.39%""62.81%"之类的表达式进行数值运算。必须将它们解析并转换为数字。

让我知道这是否不是您想要的。