基于两个数据帧上的组索引的最小差异

时间:2019-05-26 20:59:15

标签: r dataframe

我有两个数据框

A
            X1     Year_month
1          19.3945   1999_1
2          19.379    1999_1
3          19.2073   1999_1
4          19.267    2000_1
5          18.760    2000_1
6          19.3505   2000_1

和B

 Longitude   Year_month      CHL
1   12.3125     1999_1    12.70245
2    12.375     1999_1    12.63853
3   12.4375     1999_1    12.58700
4      12.5     2000_1    12.61019
5   12.5625     2000_1    12.75727
6    12.625     2000_1    13.06914

我想基于组索引Year_month,计算A$X1的每个值与B$Longitude的所有值之间的最小差,并把对应的B$CHL的值放入A$res中

也就是说,用A$X1的每个值减去同一个Year_month组内B$Longitude的各个值,找到差值最小的那一行,然后把该行B$CHL的值填入列A$res中。例如,对于df A的第一行:

B$CHL

A[1,1]-B[1,1] A[1,1]-B[2,1] A[1,1]-B[3,1] ---> 这是最小差值,因此我将结果12.58(值B[3,3])放在A$res列的第1行中,依此类推

A$X1

我尝试了下面的代码,但运行时出现了错误(错误信息见下文PS部分):
# Asker's attempt, kept as posted; it fails with the error quoted in the PS.
# lapply() over a data frame visits each column of A, and `x` is never used in
# the body, so every iteration evaluates the same expression.
A$res<- as.data.frame(lapply(A, function(x){
  # Year_month values such as "1999_1" cannot be parsed as numbers, so
  # as.numeric() yields NA. The comparison is also a whole vector, while `if`
  # needs a single TRUE/FALSE.
  if(as.numeric(as.character(A$Year_month)) == as.numeric(as.character(B$Year_month))){
    # For each B$Longitude value, which.min() gives the position of the closest
    # A$X1 value; those positions are then used to index B$CHL.
    return(B$CHL[unlist(lapply(as.numeric(as.character(B$Longitude)), function(t) which.min(abs(A$X1-t))))])
  } else{
    return(NA)
  }
}))

任何想法?

PS:

Error in if (as.numeric(as.character(A$Year_month)) == as.numeric(as.character(B$Year_month))) { : 
  missing values where is required TRUE/FALSE
Furthermore Warning messages:
1: In FUN(X[[i]], ...) :  NA for coercion
2: In FUN(X[[i]], ...) : NA for coercion
3: In if (as.numeric(as.character(A$Year_month)) == as.numeric(as.character(B$Year_month))) { :
  the condition of length > 1 only the first element is used

3 个答案:

答案 0 :(得分:1)

我使用varhandle包轻松地将因子转换为实数。

代码在这里:

library(varhandle)

# The data
# stringsAsFactors = TRUE is spelled out because, since R 4.0.0, data.frame()
# keeps strings as character by default; the class output shown here and the
# factor-to-number conversion that follows both rely on factor columns.
A <- data.frame(
  "X1" = c("19.3945", "19.379", "19.2073", "19.267", "18.760", "19.3505"),
  "Year_month" = c("1999_1", "1999_1", "1999_1", "2000_1", "2000_1", "2000_1"),
  stringsAsFactors = TRUE
)
sapply(A, class)
#        X1 Year_month 
#  "factor"   "factor"

B <- data.frame(
  "Longitude" = c("12.3125", "12.375", "12.4375", "12.5", "12.5625", "12.625"),
  "Year_month" = c("1999_1", "1999_1", "1999_1", "2000_1", "2000_1", "2000_1"),
  "CHL" = c(12.70245, 12.63853, 12.58700, 12.61019, 12.75727, 13.06914),
  stringsAsFactors = TRUE
)
sapply(B, class)
#  Longitude Year_month        CHL 
#   "factor"   "factor"  "numeric"

# Turn the factor columns that hold numbers back into real (double) vectors
A[["X1"]] <- unfactor(A[["X1"]])
B[["Longitude"]] <- unfactor(B[["Longitude"]])

# Function to apply
# Returns the CHL value of the reference row whose Longitude is closest to the
# X1 of one row of A, looking only at reference rows with the same Year_month.
#
# row: one row of A as a named vector with elements "X1" and "Year_month".
#      apply() hands rows over as character, hence the as.double() below.
# ref: data frame with columns Year_month, Longitude (numeric) and CHL.
#      Defaults to the global B, so existing one-argument calls keep working.
# Value: a single CHL value, or NA_real_ when no row of ref shares the
#        Year_month of `row`.
getCHL <- function(row, ref = B) {
  # Select matching rows on "Year_month"
  sub_df <- ref[ref$Year_month == row[["Year_month"]], ]
  if (nrow(sub_df) == 0L) {
    # No candidate rows: return a scalar NA so apply() still yields a vector
    return(NA_real_)
  }
  # abs() makes this the closest Longitude; the signed difference would instead
  # favour the largest Longitude, even one far beyond X1.
  ind <- which.min(abs(as.double(row[["X1"]]) - sub_df$Longitude))
  sub_df$CHL[ind]
}
# Run getCHL once per row of A and store the results in column "CHL"
A["CHL"] <- apply(X = A, MARGIN = 1, FUN = getCHL)

答案 1 :(得分:1)

避免任何循环,因为您的需求实质上是基于集合的数据帧计算(聚合级别连接到单元级别)。具体来说,先考虑merge,再aggregate,然后再merge

# MERGE THEN CALCULATE ROW-WISE DIFFERENCE
# Every X1 is paired with every Longitude of the same Year_month. abs() is used
# so that the minimum taken below is the closest Longitude, also when a
# Longitude is larger than X1.
mdf <- within(merge(dfA, dfB, by = "Year_month"), {
  Res <- abs(X1 - Longitude)
})

# AGGREGATE TO FIND MINIMUM RES
# One row per (Year_month, X1) holding the smallest difference.
aggdf <- aggregate(Res ~ Year_month + X1, data = mdf, FUN = min)

# MERGE AGGREGATION BACK TO UNIT LEVEL BY SAME COLUMNS
# Joining on Res works because both sides carry the identical computed values.
final_df <- merge(aggdf, mdf, by = c("Year_month", "Res", "X1"))  #by ARG IS REDUNDANT
final_df
#   Year_month    Res      X1 Longitude      CHL
# 1     1999_1 6.7698 19.2073   12.4375 12.58700
# 2     1999_1 6.9415 19.3790   12.4375 12.58700
# 3     1999_1 6.9570 19.3945   12.4375 12.58700
# 4     2000_1 6.1350 18.7600   12.6250 13.06914
# 5     2000_1 6.6420 19.2670   12.6250 13.06914
# 6     2000_1 6.7255 19.3505   12.6250 13.06914

Rextester demo

答案 2 :(得分:1)

这是一个data.table解决方案。它与@Parfait的方案相似,不同之处在于我在前面添加了一个ID,这样我就可以将结果压缩下来而不必重新连接到原始数据表。

@Optional()

这是基本的R解决方案,与之类似。

data.table

最后,由于数据类型存在一些问题,因此有以下几种转换方法:

library(data.table)
# Number the rows of A within each Year_month so that every original row keeps
# its own identity through the join.
A_dt[, ID := seq_len(.N), by = Year_month]

# Join on Year_month, then keep for each (Year_month, ID) the single joined row
# with the smallest absolute difference. abs() ensures the closest Longitude is
# chosen even when a Longitude is larger than X1.
A_dt[B_dt
     , on = "Year_month"
     , .(Year_month, ID, Res = abs(X1 - Longitude), X1, Longitude, CHL)
     , allow.cartesian = TRUE
     ][, .SD[which.min(Res), ] , by = .(Year_month, ID)]

   Year_month ID    Res      X1 Longitude      CHL
1:     1999_1  1 6.9570 19.3945   12.4375 12.58700
2:     1999_1  2 6.9415 19.3790   12.4375 12.58700
3:     1999_1  3 6.7698 19.2073   12.4375 12.58700
4:     2000_1  1 6.6420 19.2670   12.6250 13.06914
5:     2000_1  2 6.1350 18.7600   12.6250 13.06914
6:     2000_1  3 6.7255 19.3505   12.6250 13.06914