我正在使用'股票模型'来估算缺失观察值。使用示例数据集my.data
,我在三年中每年填写缺失的观测值,与1970年观测资料的分配方式成比例(尽管我可以使用2010年或1970年和2010年这样做)。
下面我提供示例数据,以两种方式获得所需估计的期望结果和代码。第一种方法的代码非常特定于此示例。我希望创建一个比第二种方法更常用的功能。在我看来,创建一个更通用的函数需要在列表列表上调用函数。我希望有人可以提供有关如何将函数应用于列表列表的建议。
以下是示例数据集和高度特定的解决方案:
my.data <- read.table(text = '
county y1970 y1980 y1990 y2000 y2010
aa 50 NA 70 NA 500
cc 10 20 NA NA 100
ee 800 NA NA 400 8000
gg 1000 1900 NA NA 10000
ii 200 400 300 100 2000
kk 20 40 30 NA 200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)
my.total <- read.table(text = '
county y1970 y1980 y1990 y2000 y2010
total 2080 4000 3000 1000 20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)
desired.result <- read.table(text = '
county y1970 y1980 y1990 y2000 y2010
aa 50 96.47059 70 23.148148 500
cc 10 20 14.36464 4.629630 100
ee 800 1543.529 1149.17127 400 8000
gg 1000 1900 1436.46409 462.962963 10000
ii 200 400 300 100 2000
kk 20 40 30 9.259259 200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)
x70 <- c(50, 800)
estimates.for.80 <- (x70 / sum(x70)) * (my.total$y1980 - sum(my.data$y1980, na.rm = TRUE))
x80 <- c(10, 800, 1000)
estimates.for.90 <- (x80 / sum(x80)) * (my.total$y1990 - sum(my.data$y1990, na.rm = TRUE))
x90 <- c(50, 10, 1000, 20)
estimates.for.00 <- (x90 / sum(x90)) * (my.total$y2000 - sum(my.data$y2000, na.rm = TRUE))
这是功能。如果我知道如何将d.counties
作为输入列表包含在函数中,我认为这可以推广。换句话说,如何在d.counties
中加入my.input
并仍然可以使用该功能?我认为我的困惑源于d.counties
长度不同的年份。
state <- 'my.state'
my.df <- read.table(text = '
county y1970 y1980 y1990 y2000 y2010
aa 50 NA 70 NA 500
cc 10 20 NA NA 100
ee 800 NA NA 400 8000
gg 1000 1900 NA NA 10000
ii 200 400 300 100 2000
kk 20 40 30 NA 200
total 2080 4000 3000 1000 20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)
pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)
# For each column containing NA's define the years to use as shares
# If use.years = 'pre' then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables. They can differ among rows below.
shares <- read.table(text = '
cyear pre.year post.year use.years
y1980 y1970 y2010 pre
y1990 y1970 y2010 pre
y2000 y1970 y2010 pre
', header = TRUE, na.strings = "NA")
d.counties.80 <- c( 'aa' ,
'ee' )
d.counties.90 <- c( 'cc' ,
'ee' ,
'gg' )
d.counties.00 <- c( 'aa' ,
'cc' ,
'gg' ,
'kk' )
d.counties <- list(d.counties.80, d.counties.90, d.counties.00)
my.input <- data.frame(shares)
my.function <- function(y) {
# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]] )]
# subset counties in year.data to only include counties with NA's in current year
if(as.numeric(substr(y[1], 2, 5)) == 1980) year.data = year.data[year.data$county %in% d.counties.80,]
if(as.numeric(substr(y[1], 2, 5)) == 1990) year.data = year.data[year.data$county %in% d.counties.90,]
if(as.numeric(substr(y[1], 2, 5)) == 2000) year.data = year.data[year.data$county %in% d.counties.00,]
# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]] )]
# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]]
# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)
# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[ , -1], function(x){ x/x[nrow(year.data.b)] } )
# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] == 'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'acreage')
# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data)
}
my.list <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2
state this.year county acreage
1 my.state 1980 aa 96.470588
3 my.state 1980 ee 1543.529412
2 my.state 1990 cc 14.364641
31 my.state 1990 ee 1149.171271
4 my.state 1990 gg 1436.464088
11 my.state 2000 aa 23.148148
21 my.state 2000 cc 4.629630
41 my.state 2000 gg 462.962963
6 my.state 2000 kk 9.259259
虽然这个功能不像我下面的答案那样通用,但上面的功能确实允许明确指定哪些县有相关的缺失值。在实际数据中有两种类型的缺失值,下面我的答案中的函数不能区分两种类型。上面的功能可以区分它们,因为我告诉它每年要考虑哪些县。
再次感谢您提出任何建议和建议。
答案 0 :(得分:4)
我认为你的整个问题可以归纳为几行。这太长了。如果您的问题确实如标题所示,将函数应用于列表列表,那么您需要递归地应用函数。有一个用于执行此操作的构造,它是rapply
:
w <- 1:5
x <- 1:5
y <- 6:10
z <- 6:10
ll <- list( list( w , x) , list( y , z) )
str(ll)
List of 2
$ :List of 2
..$ : int [1:5] 1 2 3 4 5
..$ : int [1:5] 1 2 3 4 5
$ :List of 2
..$ : int [1:5] 6 7 8 9 10
..$ : int [1:5] 6 7 8 9 10
rapply( ll , mean )
[1] 3 3 8 8
作为一个建议,基本上你可以把问题归结为......
我有这个列表列表,但是当我尝试使用lapply时它不起作用...
lapply( ll , mean )
[[1]]
[1] NA
[[2]]
[1] NA
Warning messages:
1: In mean.default(X[[1L]], ...) :
argument is not numeric or logical: returning NA
2: In mean.default(X[[2L]], ...) :
argument is not numeric or logical: returning NA
答案 1 :(得分:2)
这个问题搞砸了。我会首先尝试解释你的问题然后回答,以便更容易弄清楚我是否理解你的问题。
好的,在根据您的问题筛选出所需内容后,我发现您有一个data.frame my.data
:
county y1970 y1980 y1990 y2000 y2010
1 aa 50 NA 70 NA 500
2 cc 10 20 NA NA 100
3 ee 800 NA NA 400 8000
4 gg 1000 1900 NA NA 10000
5 ii 200 400 300 100 2000
6 kk 20 40 30 NA 200
另一个my.total
:
county y1970 y1980 y1990 y2000 y2010
1 total 2080 4000 3000 1000 20800
你想要的是(desired.result
):
county y1970 y1980 y1990 y2000 y2010
1 aa 50 96.47059 70.00000 23.148148 500
2 cc 10 20.00000 14.36464 4.629630 100
3 ee 800 1543.52900 1149.17127 400.000000 8000
4 gg 1000 1900.00000 1436.46409 462.962963 10000
5 ii 200 400.00000 300.00000 100.000000 2000
6 kk 20 40.00000 30.00000 9.259259 200
我理解你的标准的方法是,对于my.data
中包含NA的每个数字/整数列,取第1970列对应于那些NA条目的条目,并用公式替换那些NA条目: / p>
vals <- corresponding entries in column 1970
this column's NA's <- vals/sum(vals) * (my.total of this column -
sum(this.column, na.rm=TRUE))
我认为这里不需要“列表清单”。这是使用简单for循环的一种方式(因为它在概念上更容易解决这个问题)。这是因为您要修改data.frame的某些列中的某些项目。 df
将是desired.result
。
df <- my.data
for (i in which(colSums(is.na(df), na.rm=TRUE) > 0)) {
idx <- which(is.na(df[[i]]))
xx <- df[["y1970"]][idx]
df[[i]][idx] <- (xx/sum(xx)) * (my.total[[i]] - sum(df[[i]], na.rm=TRUE))
}
county y1970 y1980 y1990 y2000 y2010
1 aa 50 96.47059 70.00000 23.148148 500
2 cc 10 20.00000 14.36464 4.629630 100
3 ee 800 1543.52941 1149.17127 400.000000 8000
4 gg 1000 1900.00000 1436.46409 462.962963 10000
5 ii 200 400.00000 300.00000 100.000000 2000
6 kk 20 40.00000 30.00000 9.259259 200
答案 2 :(得分:0)
在我的原始帖子中,我询问如何将列表d.counties
作为输入包含在函数中,而不必使用一系列特定的if
语句。这是我提出的解决方案。
步骤1.创建列表d.counties
,以便保留名称:
d.counties.1980 <- c( 'aa' ,
'ee' )
d.counties.1990 <- c( 'cc' ,
'ee' ,
'gg' )
d.counties.2000 <- c( 'aa' ,
'cc' ,
'gg' ,
'kk' )
list.function <- function() {
sapply(c("d.counties.1980",
"d.counties.1990",
"d.counties.2000"), get, environment(), simplify = FALSE)
}
d.counties <- list.function()
步骤2.在函数内部替换一系列if
语句,这些语句明确指定各个年份的缺失观察值,使用以下通用名称访问列表d.counties
而不明确指定个别名称个人年份:
year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]
以下是此解决方案的完整代码:
state <- 'my.state'
my.df <- read.table(text = '
county y1970 y1980 y1990 y2000 y2010
aa 50 NA 70 NA 500
cc 10 20 NA NA 100
ee 800 NA NA 400 8000
gg 1000 1900 NA NA 10000
ii 200 400 300 100 2000
kk 20 40 30 NA 200
total 2080 4000 3000 1000 20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)
pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)
# For each column containing NA's define the years to use as shares
# If use.years = 'pre' then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables. They can differ among rows below.
shares <- read.table(text = '
cyear pre.year post.year use.years
y1980 y1970 y2010 pre
y1990 y1970 y2010 pre
y2000 y1970 y2010 pre
', header = TRUE, na.strings = "NA")
d.counties.1980 <- c( 'aa' ,
'ee' )
d.counties.1990 <- c( 'cc' ,
'ee' ,
'gg' )
d.counties.2000 <- c( 'aa' ,
'cc' ,
'gg' ,
'kk' )
list.function <- function() {
sapply(c("d.counties.1980",
"d.counties.1990",
"d.counties.2000"), get, environment(), simplify = FALSE)
}
d.counties <- list.function()
d.counties
my.input <- data.frame(shares)
my.function <- function(y) {
# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]] )]
# subset counties in year.data to only include counties with NA's in current year
year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]
# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]] )]
# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]]
# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)
# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[ , -1], function(x){ x/x[nrow(year.data.b)] } )
# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] == 'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'estimates')
# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data)
}
my.list <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2
state this.year county estimates
1 my.state 1980 aa 96.470588
3 my.state 1980 ee 1543.529412
2 my.state 1990 cc 14.364641
31 my.state 1990 ee 1149.171271
4 my.state 1990 gg 1436.464088
11 my.state 2000 aa 23.148148
21 my.state 2000 cc 4.629630
41 my.state 2000 gg 462.962963
6 my.state 2000 kk 9.259259
这是Arun回答中提出的另一种功能。使用此函数,我使用data.frame shares
访问函数内的sapply
,以允许我将列名称视为变量。但是,此功能不适合该任务,因为我的实际数据有两种类型的缺失观察,并且此功能无法区分这两者。上面的函数可以区分这两者,因为我在列表d.counties
中明确指出了相关的缺失观察值。在我的示例数据集中,我假设所有缺失的观察结果都是相同的类型,因此两个函数都返回相同的估计值。
# data set
my.data <- read.table(text = '
county y1970 y1980 y1990 y2000 y2010
aa 50 NA 70 NA 550
cc 10 20 NA NA 100
ee 800 NA NA 400 9000
gg 1000 1900 NA NA 12000
ii 200 400 300 100 1500
kk 20 40 30 NA 100
total 2080 4000 3000 1000 23250
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)
# extract columns with NA's
my.data2 <- my.data[(which(colSums(is.na(my.data), na.rm=TRUE) > 0))]
# For each column containing NA's define the years to use as shares
# If use.years = 'pre' then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables. They can differ among rows below.
shares <- read.table(text = '
cyear pre.year post.year use.years
y1980 y1970 y2010 pre
y1990 y1970 y2010 post
y2000 y1970 y2010 both
', header = TRUE, na.strings = "NA")
# extract last row of my.data2
my.total <- my.data2[nrow(my.data),]
# For each column sum all but the last row of my.data2
my.colsums <- colSums(my.data2[1:(nrow(my.data2)-1),], na.rm = TRUE)
# For each column in my.data2 calculate the number to be divided among rows with NA's
divide.up <- my.total - my.colsums
my.function <- function(x) {
idx <- which(is.na((my.data2)[x]))
names.x <- as.character(colnames(my.data2)[x])
my.pre.col <- as.character(shares$pre.year[shares$cyear==names.x])
my.post.col <- as.character(shares$post.year[shares$cyear==names.x])
my.use.year <- as.character(shares$use.years[shares$cyear==names.x])
xx.pre <- my.data[[my.pre.col]][idx]
xx.post <- my.data[[my.post.col]][idx]
if(my.use.year=='pre' ) my.data2[[x]][idx] = (xx.pre /sum(xx.pre )) * divide.up[[x]]
if(my.use.year=='post') my.data2[[x]][idx] = (xx.post/sum(xx.post)) * divide.up[[x]]
if(my.use.year=='both') my.data2[[x]][idx] = (((xx.pre /sum(xx.pre )) + (xx.post/sum(xx.post))) / 2) * divide.up[[x]]
return(my.data2[x])
}
na.estimates <- sapply(1:ncol(my.data2), function(x) {my.function(x)})
revised.data <- t(do.call("rbind", na.estimates))
revised.data
y1980 y1990 y2000
[1,] 96.47059 70.00000 22.358388
[2,] 20.00000 12.32227 4.275599
[3,] 1543.52941 1109.00474 400.000000
[4,] 1900.00000 1478.67299 466.775599
[5,] 400.00000 300.00000 100.000000
[6,] 40.00000 30.00000 6.590414
[7,] 4000.00000 3000.00000 1000.000000