当我将帖子https://stackoverflow.com/questions/47722530/count-match-ratio-between-two-dataset的答案应用于实际数据时,它有一些错误。
由于真实数据非常大，这里只给出两个数据集的结构：
> str(sesmic_events)
'data.frame': 13 obs. of 3 variables:
$ TIME_ : POSIXct, format: "2013-10-15 00:12:32" "2013-10-25 17:10:19" "2014-04-11 07:07:23" "2014-04-12 20:14:39" ...
$ LATITUDE : num 9.88 37.16 -6.59 -11.27 -6.75 ...
$ LONGITUDE: num 124 145 155 162 155 ...
> str(events_data)
'data.frame': 53646 obs. of 17 variables:
$ LONGITUDE : num 118 118 118 118 118 ...
$ LATITUDE : num -11.5 -11.5 -11.5 -11.5 -11.5 -11.5 -11.5 -11.5 -11.5 -11.5 ...
$ DATE_START : POSIXct, format: "2014-12-30" "2014-12-31" "2015-01-01" "2015-01-02" ...
$ DATE_END : POSIXct, format: "2015-01-06" "2015-01-07" "2015-01-08" "2015-01-09" ...
$ FLAG : num 2 2 2 2 2 2 2 2 2 2 ...
$ SURFSKINTEMP: int 13 1 16 16 7 13 9 9 9 9 ...
$ SURFAIRTEMP : int 6 6 6 6 6 6 6 4 5 2 ...
$ TOTH2OVAP : int 5 17 17 17 17 17 17 2 2 12 ...
$ TOTO3 : int 16 16 16 10 7 7 7 14 14 14 ...
$ TOTCO : int 12 12 8 4 12 12 10 10 10 10 ...
$ TOTCH4 : int 13 14 6 6 11 7 14 14 14 14 ...
$ OLR_ARIS : int 10 4 4 7 5 10 6 4 4 4 ...
$ CLROLR_ARIS : int 10 4 4 7 5 10 6 4 4 4 ...
$ OLR_NOAA : int 10 10 10 10 7 9 9 9 9 6 ...
$ MODIS_LST : int 1 1 1 1 1 1 1 1 1 1 ...
$ MATCH_COUNT : num 2 16 14 9 8 9 12 11 16 11 ...
$ MATCH_RATIO : num 0.00627 0.05016 0.04389 0.02821 0.02508 ...
数据集的前几行（head）：
> dput(head(sesmic_events))
structure(list(TIME_ = structure(c(1381795952.05, 1382721019.71,
1397200043.13, 1397333679.3, 1397914080.81, 1416018701.72), class = c("POSIXct",
"POSIXt")), LATITUDE = c(9.88, 37.156, -6.586, -11.27, -6.755,
1.893), LONGITUDE = c(124.1167, 144.6611, 155.0485, 162.1481,
155.0241, 126.5217)), .Names = c("TIME_", "LATITUDE", "LONGITUDE"
), row.names = c(NA, 6L), class = "data.frame")
> dput(head(events_data))
structure(list(LONGITUDE = c(118.5, 118.5, 118.5, 118.5, 118.5,
118.5), LATITUDE = c(-11.5, -11.5, -11.5, -11.5, -11.5, -11.5
), DATE_START = structure(c(1419897600, 1419984000, 1420070400,
1420156800, 1420243200, 1420329600), class = c("POSIXct", "POSIXt"
)), DATE_END = structure(c(1420502400, 1420588800, 1420675200,
1420761600, 1420848000, 1420934400), class = c("POSIXct", "POSIXt"
)), FLAG = c(2, 2, 2, 2, 2, 2), SURFSKINTEMP = c(13L, 1L, 16L,
16L, 7L, 13L), SURFAIRTEMP = c(6L, 6L, 6L, 6L, 6L, 6L), TOTH2OVAP = c(5L,
17L, 17L, 17L, 17L, 17L), TOTO3 = c(16L, 16L, 16L, 10L, 7L, 7L
), TOTCO = c(12L, 12L, 8L, 4L, 12L, 12L), TOTCH4 = c(13L, 14L,
6L, 6L, 11L, 7L), OLR_ARIS = c(10L, 4L, 4L, 7L, 5L, 10L), CLROLR_ARIS = c(10L,
4L, 4L, 7L, 5L, 10L), OLR_NOAA = c(10L, 10L, 10L, 10L, 7L, 9L
), MODIS_LST = c(1L, 1L, 1L, 1L, 1L, 1L), MATCH_COUNT = c(2,
16, 14, 9, 8, 9), MATCH_RATIO = c(0.00626959247648903, 0.0501567398119122,
0.0438871473354232, 0.0282131661442006, 0.0250783699059561, 0.0282131661442006
)), .Names = c("LONGITUDE", "LATITUDE", "DATE_START", "DATE_END",
"FLAG", "SURFSKINTEMP", "SURFAIRTEMP", "TOTH2OVAP", "TOTO3",
"TOTCO", "TOTCH4", "OLR_ARIS", "CLROLR_ARIS", "OLR_NOAA", "MODIS_LST",
"MATCH_COUNT", "MATCH_RATIO"), row.names = c(NA, 6L), class = "data.frame")
当我运行以下代码时：
# Pull out the columns of events_data that the per-row matcher reads.
long <- events_data$LONGITUDE
lat <- events_data$LATITUDE
date_s <- events_data$DATE_START
date_e <- events_data$DATE_END
myVals <- events_data$MATCH_RATIO
# For every row of sesmic_events, compute the share of matching events_data
# rows whose MATCH_RATIO exceeds 0.01. The row x is indexed by position:
# x[1] = TIME_, x[2] = LATITUDE, x[3] = LONGITUDE.
# NOTE: apply() converts the data frame to a matrix first. TIME_ is POSIXct,
# not numeric, so every element of x arrives as character and the arithmetic
# below stops with "non-numeric argument to binary operator".
sesmic_events$MatchRatio <- apply(sesmic_events, 1, function(x) {
# Rows whose longitude lies strictly within 5 of this row's longitude.
v <- which(long > (x[3] - 5) & long < (x[3] + 5))
# Rows whose latitude lies strictly within 5 of this row's latitude.
z <- which(lat > (x[2] - 5) & lat < (x[2] + 5))
# Rows starting no earlier than 60 before this row's time
# (NOTE(review): the unit of 60 depends on the type of x[1] - confirm).
t_s <- which(date_s >= x[1] - 60)
# Rows ending no later than this row's time.
t_e <- which(date_e <= x[1])
# Keep only the row indices that satisfy all four conditions.
ind <- Reduce(intersect, list(z,v,t_s,t_e))
# Fraction of the matched rows with MATCH_RATIO above 0.01.
sum(myVals[ind] > 0.01)/length(ind)
})
出现如下错误：
Error in x[3] - 5 : non-numeric argument to binary operator
答案 0 :(得分:0)
问题在于 `x` 被强制转换成了 `character`：`apply()` 会先把数据框转换为矩阵，而 `TIME_` 列是 POSIXct 而非数值，于是整行都变成了字符串。观察：
# Diagnostic run of the same per-row matcher: x is the current row of
# sesmic_events (x[1] = TIME_, x[2] = LATITUDE, x[3] = LONGITUDE), and
# long, lat, date_s, date_e and myVals are the events_data columns
# extracted beforehand.
sesmic_events$MatchRatio <- apply(sesmic_events, 1, function(x) {
## Print the class of the row vector to show what apply() actually passes in;
## it reports "character" because the data frame was converted to a matrix.
print(class(x))
# First arithmetic on x: this is where the error is raised, since x[3]
# is a character string rather than a number.
v <- which(long > (x[3] - 5) & long < (x[3] + 5))
z <- which(lat > (x[2] - 5) & lat < (x[2] + 5))
t_s <- which(date_s >= x[1] - 60)
t_e <- which(date_e <= x[1])
# Row indices satisfying all four conditions.
ind <- Reduce(intersect, list(z,v,t_s,t_e))
# Fraction of the matched rows with MATCH_RATIO above 0.01.
sum(myVals[ind] > 0.01)/length(ind)
})
[1] "character"
Error in x[3] - 5 : non-numeric argument to binary operator
只需将日期字段转换为整数即可解决问题。观察:
# Columns of events_data used by the matcher. Times are converted to epoch
# seconds with as.numeric() so that every comparison below is plain numeric.
# Unlike as.integer(), this keeps fractional seconds and cannot turn large
# timestamps into NA.
long <- events_data$LONGITUDE
lat <- events_data$LATITUDE
date_s <- as.numeric(events_data$DATE_START)
date_e <- as.numeric(events_data$DATE_END)
myVals <- events_data$MATCH_RATIO

# Read the seismic columns by name into plain vectors instead of calling
# apply(sesmic_events, 1, ...): apply() converts the data frame to a matrix,
# which forces a POSIXct column to character. Working on copies also leaves
# sesmic_events$TIME_ as POSIXct rather than overwriting it with numbers.
ev_time <- as.numeric(sesmic_events$TIME_)
ev_lat <- sesmic_events$LATITUDE
ev_lon <- sesmic_events$LONGITUDE

# Half-width of the longitude/latitude box and length of the look-back window.
# NOTE(review): times are epoch seconds here, so 60 means 60 seconds; if a
# 60-day window was intended, this should be 60 * 86400 - confirm.
box <- 5
lookback <- 60

# One ratio per seismic event; vapply() guarantees a numeric vector even
# when sesmic_events has zero rows.
sesmic_events$MatchRatio <- vapply(seq_along(ev_time), function(i) {
  # Rows of events_data inside the spatial box (strict bounds) whose
  # interval starts within the look-back window and ends by the event time.
  ind <- which(
    long > (ev_lon[i] - box) & long < (ev_lon[i] + box) &
      lat > (ev_lat[i] - box) & lat < (ev_lat[i] + box) &
      date_s >= (ev_time[i] - lookback) & date_e <= ev_time[i]
  )
  # Share of matched rows with MATCH_RATIO above 0.01. When nothing matches,
  # length(ind) is 0 and the result is NaN (0 / 0).
  sum(myVals[ind] > 0.01) / length(ind)
}, numeric(1))
唯一的问题是：基于样本数据，`MatchRatio` 字段的每个值都是 `NaN`，因为每一行的 `length(ind)` 都等于 0（0/0 的结果为 NaN）。
无论如何，提问者（OP）应该认真考虑改用更健壮的方案，例如基于 `data.table` 或 `dplyr` 的实现。