Question

我正在使用'股票模型'来估算缺失观察值。使用示例数据集my.data，我在三年中每年填写缺失的观测值，与1970年观测资料的分配方式成比例（尽管我可以使用2010年或1970年和2010年这样做）。

下面我提供示例数据，以两种方式获得所需估计的期望结果和代码。第一种方法的代码非常特定于此示例。我希望创建一个比第二种方法更常用的功能。在我看来，创建一个更通用的函数需要在列表列表上调用函数。我希望有人可以提供有关如何将函数应用于列表列表的建议。

以下是示例数据集和高度特定的解决方案：

my.data <- read.table(text = '
 county  y1970  y1980  y1990  y2000  y2010
   aa      50     NA     70     NA     500
   cc      10     20     NA     NA     100
   ee     800     NA     NA    400    8000
   gg    1000   1900     NA     NA   10000
   ii     200    400    300    100    2000
   kk      20     40     30     NA     200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

my.total <- read.table(text = '
   county  y1970  y1980  y1990  y2000  y2010
   total    2080   4000   3000   1000  20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

desired.result <- read.table(text = '
 county  y1970  y1980         y1990       y2000       y2010
   aa      50  96.47059         70      23.148148       500
   cc      10     20         14.36464    4.629630       100
   ee     800   1543.529   1149.17127      400         8000
   gg    1000   1900       1436.46409   462.962963    10000
   ii     200    400           300         100         2000
   kk      20     40            30       9.259259       200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

x70 <- c(50, 800)
estimates.for.80 <- (x70 / sum(x70)) * (my.total$y1980 - sum(my.data$y1980, na.rm = TRUE))

x80 <- c(10, 800, 1000)
estimates.for.90 <- (x80 / sum(x80)) * (my.total$y1990 - sum(my.data$y1990, na.rm = TRUE))

x90 <- c(50, 10, 1000, 20)
estimates.for.00 <- (x90 / sum(x90)) * (my.total$y2000 - sum(my.data$y2000, na.rm = TRUE))

这是功能。如果我知道如何将d.counties作为输入列表包含在函数中，我认为这可以推广。换句话说，如何在d.counties中加入my.input并仍然可以使用该功能？我认为我的困惑源于d.counties长度不同的年份。

state <- 'my.state'

my.df <- read.table(text = '
  county   y1970  y1980  y1990  y2000   y2010
      aa      50     NA     70     NA     500
      cc      10     20     NA     NA     100
      ee     800     NA     NA    400    8000
      gg    1000   1900     NA     NA   10000
      ii     200    400    300    100    2000
      kk      20     40     30     NA     200
   total    2080   4000   3000   1000   20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year  post.year  use.years
      y1980     y1970     y2010        pre
      y1990     y1970     y2010        pre
      y2000     y1970     y2010        pre
', header = TRUE, na.strings = "NA")

d.counties.80 <- c( 'aa' ,
                    'ee' )

d.counties.90 <- c( 'cc' ,
                    'ee' , 
                    'gg' )

d.counties.00 <- c( 'aa' ,
                    'cc' ,
                    'gg' ,
                    'kk' )

d.counties <- list(d.counties.80, d.counties.90, d.counties.00)

my.input <- data.frame(shares)

my.function <- function(y) {

# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]]        )]

# subset counties in year.data to only include counties with NA's in current year
if(as.numeric(substr(y[1], 2, 5)) == 1980) year.data = year.data[year.data$county %in% d.counties.80,]
if(as.numeric(substr(y[1], 2, 5)) == 1990) year.data = year.data[year.data$county %in% d.counties.90,]
if(as.numeric(substr(y[1], 2, 5)) == 2000) year.data = year.data[year.data$county %in% d.counties.00,]

# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]]        )]

# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]] 

# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)

# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[  , -1], function(x){ x/x[nrow(year.data.b)] } )

# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] ==  'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)

# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'acreage')

# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data) 
}

my.list  <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2

      state this.year county     acreage
1  my.state      1980     aa   96.470588
3  my.state      1980     ee 1543.529412
2  my.state      1990     cc   14.364641
31 my.state      1990     ee 1149.171271
4  my.state      1990     gg 1436.464088
11 my.state      2000     aa   23.148148
21 my.state      2000     cc    4.629630
41 my.state      2000     gg  462.962963
6  my.state      2000     kk    9.259259

虽然这个功能不像我下面的答案那样通用，但上面的功能确实允许明确指定哪些县有相关的缺失值。在实际数据中有两种类型的缺失值，下面我的答案中的函数不能区分两种类型。上面的功能可以区分它们，因为我告诉它每年要考虑哪些县。

再次感谢您提出任何建议和建议。

Answer 1

我认为你的整个问题可以归纳为几行。这太长了。如果您的问题确实如标题所示，将函数应用于列表列表，那么您需要递归地应用函数。有一个用于执行此操作的构造，它是rapply：

w <- 1:5
x <- 1:5
y <- 6:10
z <- 6:10

ll <- list( list( w , x) , list( y , z) )
str(ll)
List of 2
 $ :List of 2
  ..$ : int [1:5] 1 2 3 4 5
  ..$ : int [1:5] 1 2 3 4 5
 $ :List of 2
  ..$ : int [1:5] 6 7 8 9 10
  ..$ : int [1:5] 6 7 8 9 10

rapply( ll , mean )
[1] 3 3 8 8

作为一个建议，基本上你可以把问题归结为......

我有这个列表列表，但是当我尝试使用lapply时它不起作用...

lapply( ll , mean )
[[1]]
[1] NA

[[2]]
[1] NA

Warning messages:
1: In mean.default(X[[1L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[2L]], ...) :
  argument is not numeric or logical: returning NA

Answer 2

这个问题搞砸了。我会首先尝试解释你的问题然后回答，以便更容易弄清楚我是否理解你的问题。

问题的要点：

好的，在根据您的问题筛选出所需内容后，我发现您有一个data.frame my.data：

  county y1970 y1980 y1990 y2000 y2010
1     aa    50    NA    70    NA   500
2     cc    10    20    NA    NA   100
3     ee   800    NA    NA   400  8000
4     gg  1000  1900    NA    NA 10000
5     ii   200   400   300   100  2000
6     kk    20    40    30    NA   200

另一个my.total：

  county y1970 y1980 y1990 y2000 y2010
1  total  2080  4000  3000  1000 20800

你想要的是（desired.result）：

  county y1970      y1980      y1990      y2000 y2010
1     aa    50   96.47059   70.00000  23.148148   500
2     cc    10   20.00000   14.36464   4.629630   100
3     ee   800 1543.52900 1149.17127 400.000000  8000
4     gg  1000 1900.00000 1436.46409 462.962963 10000
5     ii   200  400.00000  300.00000 100.000000  2000
6     kk    20   40.00000   30.00000   9.259259   200

标准：

我理解你的标准的方法是，对于my.data中包含NA的每个数字/整数列，取第1970列对应于那些NA条目的条目，并用公式替换那些NA条目： / p>

vals <- corresponding entries in column 1970
this column's NA's <- vals/sum(vals) * (my.total of this column - 
                    sum(this.column, na.rm=TRUE))

可能的解决方案：

我认为这里不需要“列表清单”。这是使用简单for循环的一种方式（因为它在概念上更容易解决这个问题）。这是因为您要修改data.frame的某些列中的某些项目。 df将是desired.result。

df <- my.data
for (i in which(colSums(is.na(df), na.rm=TRUE) > 0)) {
    idx <- which(is.na(df[[i]]))
    xx <- df[["y1970"]][idx]
    df[[i]][idx] <- (xx/sum(xx)) * (my.total[[i]] - sum(df[[i]], na.rm=TRUE))
}

  county y1970      y1980      y1990      y2000 y2010
1     aa    50   96.47059   70.00000  23.148148   500
2     cc    10   20.00000   14.36464   4.629630   100
3     ee   800 1543.52941 1149.17127 400.000000  8000
4     gg  1000 1900.00000 1436.46409 462.962963 10000
5     ii   200  400.00000  300.00000 100.000000  2000
6     kk    20   40.00000   30.00000   9.259259   200

Answer 3

在我的原始帖子中，我询问如何将列表d.counties作为输入包含在函数中，而不必使用一系列特定的if语句。这是我提出的解决方案。

步骤1.创建列表d.counties，以便保留名称：

d.counties.1980 <- c( 'aa' ,
                      'ee' )

d.counties.1990 <- c( 'cc' ,
                      'ee' , 
                      'gg' )

d.counties.2000 <- c( 'aa' ,
                      'cc' ,
                      'gg' ,
                      'kk' )

list.function <-  function() { 

     sapply(c("d.counties.1980",
              "d.counties.1990",
              "d.counties.2000"), get, environment(), simplify = FALSE) 
} 

d.counties <- list.function()

步骤2.在函数内部替换一系列if语句，这些语句明确指定各个年份的缺失观察值，使用以下通用名称访问列表d.counties而不明确指定个别名称个人年份：

year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]

以下是此解决方案的完整代码：

state <- 'my.state'

my.df <- read.table(text = '
  county   y1970  y1980  y1990  y2000   y2010
      aa      50     NA     70     NA     500
      cc      10     20     NA     NA     100
      ee     800     NA     NA    400    8000
      gg    1000   1900     NA     NA   10000
      ii     200    400    300    100    2000
      kk      20     40     30     NA     200
   total    2080   4000   3000   1000   20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year  post.year  use.years
      y1980     y1970     y2010        pre
      y1990     y1970     y2010        pre
      y2000     y1970     y2010        pre
', header = TRUE, na.strings = "NA")

d.counties.1980 <- c( 'aa' ,
                      'ee' )

d.counties.1990 <- c( 'cc' ,
                      'ee' , 
                      'gg' )

d.counties.2000 <- c( 'aa' ,
                      'cc' ,
                      'gg' ,
                      'kk' )

list.function <-  function() { 

     sapply(c("d.counties.1980",
              "d.counties.1990",
              "d.counties.2000"), get, environment(), simplify = FALSE) 
} 

d.counties <- list.function()
d.counties

my.input <- data.frame(shares)

my.function <- function(y) {

# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]]        )]

# subset counties in year.data to only include counties with NA's in current year
year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]

# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]]        )]

# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]] 

# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)

# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[  , -1], function(x){ x/x[nrow(year.data.b)] } )

# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] ==  'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)

# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'estimates')

# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data) 
}

my.list  <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2

      state this.year county   estimates
1  my.state      1980     aa   96.470588
3  my.state      1980     ee 1543.529412
2  my.state      1990     cc   14.364641
31 my.state      1990     ee 1149.171271
4  my.state      1990     gg 1436.464088
11 my.state      2000     aa   23.148148
21 my.state      2000     cc    4.629630
41 my.state      2000     gg  462.962963
6  my.state      2000     kk    9.259259

这是Arun回答中提出的另一种功能。使用此函数，我使用data.frame shares访问函数内的sapply，以允许我将列名称视为变量。但是，此功能不适合该任务，因为我的实际数据有两种类型的缺失观察，并且此功能无法区分这两者。上面的函数可以区分这两者，因为我在列表d.counties中明确指出了相关的缺失观察值。在我的示例数据集中，我假设所有缺失的观察结果都是相同的类型，因此两个函数都返回相同的估计值。

# data set
my.data <- read.table(text = '
 county  y1970  y1980  y1990  y2000  y2010
   aa      50     NA     70     NA     550
   cc      10     20     NA     NA     100
   ee     800     NA     NA    400    9000
   gg    1000   1900     NA     NA   12000
   ii     200    400    300    100    1500
   kk      20     40     30     NA     100
total    2080   4000   3000   1000   23250
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

# extract columns with NA's
my.data2 <- my.data[(which(colSums(is.na(my.data), na.rm=TRUE) > 0))]

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year   post.year   use.years
      y1980     y1970      y2010        pre
      y1990     y1970      y2010        post
      y2000     y1970      y2010        both
', header = TRUE, na.strings = "NA")

# extract last row of my.data2
my.total   <- my.data2[nrow(my.data),]

# For each column sum all but the last row of my.data2
my.colsums <- colSums(my.data2[1:(nrow(my.data2)-1),], na.rm = TRUE)

# For each column in my.data2 calculate the number to be divided among rows with NA's
divide.up  <- my.total - my.colsums

my.function <- function(x) { 
    idx <- which(is.na((my.data2)[x]))
    names.x <- as.character(colnames(my.data2)[x])

    my.pre.col  <- as.character(shares$pre.year[shares$cyear==names.x])
    my.post.col <- as.character(shares$post.year[shares$cyear==names.x])
    my.use.year <- as.character(shares$use.years[shares$cyear==names.x])

    xx.pre  <- my.data[[my.pre.col]][idx]
    xx.post <- my.data[[my.post.col]][idx]

    if(my.use.year=='pre' ) my.data2[[x]][idx] =   (xx.pre /sum(xx.pre ))                                * divide.up[[x]]
    if(my.use.year=='post') my.data2[[x]][idx] =   (xx.post/sum(xx.post))                                * divide.up[[x]]
    if(my.use.year=='both') my.data2[[x]][idx] = (((xx.pre /sum(xx.pre )) + (xx.post/sum(xx.post))) / 2) * divide.up[[x]]

    return(my.data2[x])

}

na.estimates <- sapply(1:ncol(my.data2), function(x) {my.function(x)})
revised.data  <- t(do.call("rbind", na.estimates))
revised.data

          y1980      y1990       y2000
[1,]   96.47059   70.00000   22.358388
[2,]   20.00000   12.32227    4.275599
[3,] 1543.52941 1109.00474  400.000000
[4,] 1900.00000 1478.67299  466.775599
[5,]  400.00000  300.00000  100.000000
[6,]   40.00000   30.00000    6.590414
[7,] 4000.00000 3000.00000 1000.000000

在列表列表上调用函数

3 个答案:

问题的要点：

标准：

可能的解决方案：