我有一个大型数据集,其中同一个站点(station)在所有行中的纬度(lat)和经度(lon)都相同。但数据集中有些行缺少 lat 和 lon,对应位置写的是 'unknown'。我需要用同一站点其他没有缺失数据的行,把这些 'unknown' 的 lat 和 lon 填上。
在这个例子中,我希望第 5 行的 lat 和 lon 分别填入 3 和 8:
> station <- c("a","b","c","c","c")
> lat <- c("1","2","3","3","unknown")
> lon <- c("6","7","8","8","unknown")
> data.frame(station,lat,lon)
station lat lon
1 a 1 6
2 b 2 7
3 c 3 8
4 c 3 8
5 c unknown unknown
我的数据集有一百万行。即使需要几分钟才能完成也没关系,因为这一步只会在分析开始前运行一次。除非确实有必要,否则我不想再安装额外的包。
答案 0 :(得分:2)
也许可以这样做:
# Fill "unknown" lat/lon cells with the value recorded on another row of the
# same station. Expects a data frame `df` with columns station, lat and lon.
df$station <- as.character(df$station)
for (coord in c("lat", "lon")) {
  # Compare on a character copy of the column.
  values <- as.character(df[[coord]])
  is_unknown <- !is.na(values) & values == "unknown"
  is_known <- !is.na(values) & !is_unknown
  # For every row: the first known value of that row's station (NA if the
  # station has no known value). One match() call replaces the per-station
  # subset() loop.
  donor <- values[is_known][match(df$station, df$station[is_known])]
  # Only overwrite cells that are "unknown" and have a donor value; stations
  # without any known value keep "unknown" instead of raising an error.
  fill <- is_unknown & !is.na(donor)
  df[[coord]][fill] <- donor[fill]
}
答案 1 :(得分:2)
我会使用 zoo 包中的 na.locf(用前面最近的非缺失值进行填充)。首先把 unknown 替换为 NA,然后按站点分组分别应用 na.locf:
> library(zoo)
> df[ df=="unknown"] <- NA
> df2 <- do.call(rbind, lapply(split(df, df$station), na.locf))
> df2[, -1] <- sapply(df2[, -1], as.numeric) # numeric variables should be numeric
> df2
station lat lon
a a 1 6
b b 2 7
c.3 c 3 8
c.4 c 3 8
c.5 c 3 8
如果不想要上面这种行名,可以用 rownames 重新指定行名:
> rownames(df2) <- 1:nrow(df2)
> df2
station lat lon
1 a 1 6
2 b 2 7
3 c 3 8
4 c 3 8
5 c 3 8
答案 2 :(得分:0)
# Fill "unknown" latitude/longitude entries using another row of the same
# station.
#
# Args:
#   station: vector of station identifiers.
#   lat, lon: vectors parallel to `station`; the string "unknown" marks a
#     missing coordinate.
#
# Returns:
#   The result of cbind() on the three vectors, with columns named station,
#   lat and lon, where every "unknown" entry has been replaced by the first
#   known value of the same column for the same station. Entries whose station
#   has no known value are left as "unknown".
y <- function(station, lat, lon) {
  # Replace "unknown" entries of one coordinate vector via a station lookup.
  fill_unknown <- function(values) {
    is_unknown <- !is.na(values) & values == "unknown"
    is_known <- !is.na(values) & !is_unknown
    # For each row, the first known value belonging to that row's station
    # (NA when the station has none).
    donor <- values[is_known][match(station, station[is_known])]
    fill <- is_unknown & !is.na(donor)
    values[fill] <- donor[fill]
    values
  }

  # lat and lon are handled independently, so they may be missing on
  # different rows and for any number of stations.
  cbind(station = station, lat = fill_unknown(lat), lon = fill_unknown(lon))
}
## Case 1: row 5 has "unknown" for both lat and lon.
station <- c("a","b","c","c","c")
lat <- c("1","2","3","3","unknown")
lon <- c("6","7","8","8","unknown")
y(station,lat,lon)
# Printed result recorded for this call:
# station lat lon
# [1,] "a" "1" "6"
# [2,] "b" "2" "7"
# [3,] "c" "3" "8"
# [4,] "c" "3" "8"
# [5,] "c" "3" "8"
## Case 2: only lon is "unknown" in row 5; lat is fully known.
station <- c("a","b","c","c","c")
lat <- c("1","2","3","3","3")
lon <- c("6","7","8","8","unknown")
y(station,lat,lon)
# Printed result recorded for this call:
# station lat lon
# [1,] "a" "1" "6"
# [2,] "b" "2" "7"
# [3,] "c" "3" "8"
# [4,] "c" "3" "8"
# [5,] "c" "3" "8"
## Case 3: only lat is "unknown" in row 5; lon is fully known.
station <- c("a","b","c","c","c")
lat <- c("1","2","3","3","unknown")
lon <- c("6","7","8","8","8")
y(station,lat,lon)
# Printed result recorded for this call:
# station lat lon
# [1,] "a" "1" "6"
# [2,] "b" "2" "7"
# [3,] "c" "3" "8"
# [4,] "c" "3" "8"
# [5,] "c" "3" "8"