Question

我有一个表格，其中包含每种类型的描述性统计数据（a、b 和 c 的平均值）

### stats
# Reference table: one row per type (letters "a" through "l") with the
# columns mean_a, mean_b and mean_c for that type.
type <- letters[1:12]
mean_a <- c(0, 1, 1, 0, 2, 2, 0, 4, 4, 0, 5, 5)
mean_b <- c(4, 7, 8, 0, 3, 10, 5, 4, 7, 0, 1, 6)
mean_c <- c(1, 2, 0, 3, 4, 5, 1, 24, 3, 0, 4, 5)
stats <- data.frame(
  type = type,
  mean_a = mean_a,
  mean_b = mean_b,
  mean_c = mean_c
)

我有一个数据集，其中包含参数a，b和c的样本观察结果。每个样本都有特定的类型

# data
# Sample observations: each row is one sample (Id) of a given type together
# with its values for the columns a, b and c.
Id <- c(
  "ted", "bert", "test", "john", "elf", "fea",
  "goul", "houl", "ili", "jok", "ko", "lol"
)
type <- c("a", "a", "b", "d", "f", "f", "c", "d", "a", "b", "k", "l")
a <- c(2, 1, 3, 2, 1, 2, 0, 1, 2, 1, 5, 5)
b <- c(1, 3, 4, 7, 5, 4, 5, 6, 5, 0, 1, 6)
c <- c(3, 5, 2, 6, 8, 5, 1, 5, 3, 1, 6, 6)
data <- data.frame(Id = Id, type = type, a = a, b = b, c = c)

根据这两个表格，我想从 data 中为 stats 里的类型选出最具代表性的样本：

# output wanted
Id <- c("ted","test","john")
type <- c("a","b","c")
a <- c(2,3,2)
b<- c(1,4,7)
c<- c(3,2,6)
data2 <- data.frame(Id, type, a, b, c )

所谓“最具代表性”，是指样本的 a、b 和 c 的值最接近其所属类型各自的平均值。

我在网上没有找到同时针对 3 个平均值（a、b 和 c）进行匹配的思路。欢迎提供帮助！期望的输出（但我不确定 ted、test 和 john 是否真的最接近类型 a、b 和 c 的平均值）：

# Load the EasyOPC "Classic" assembly so that its .NET types can be created.
Add-Type -Path 'OpcLabs.EasyOpcClassic.dll'

# Instantiate the EasyOPC-DA client component.
$daClient = New-Object -TypeName OpcLabs.EasyOpc.DataAccess.EasyDAClient

# Read one item value; the call's result is left on the pipeline, so it is
# displayed. The three arguments are presumably machine name (empty here),
# server ProgID and item ID -- confirm against the EasyOPC API reference.
$daClient.ReadItemValue('', 'OPCLabs.KitServer.2', 'Demo.Single')

Answer 1

“最具代表性”正如你自己提到的那样，含义非常模糊，但这里做一次尝试：计算 data 中的值与 stats 中各 mean 值之间的差异，并在每种类型中保留平均差异最小的样本。由于我事先合并了两个数据框，你可以在代码末尾使用 select() 函数并按需修改（保留/删除变量）。

library(dplyr)

# Attach the reference means of its type (mean_a, mean_b, mean_c) to every
# sample. The observation table defined in the question is called `data`.
df1 <- merge(data, stats, by = "type")

df1 %>%
  # Mean absolute deviation of (a, b, c) from the means of the sample's type.
  # abs() is applied per parameter *before* averaging, so positive and
  # negative deviations cannot cancel each other out.
  mutate(new = (abs(a - mean_a) + abs(b - mean_b) + abs(c - mean_c)) / 3) %>%
  group_by(type) %>%
  # Keep the closest sample(s) within each type; ties are all retained.
  filter(new == min(new)) %>%
  select(-new)

#Source: local data frame [7 x 8]
#Groups: type [7]

#    type     Id     a     b     c mean_a mean_b mean_c
#  <fctr> <fctr> <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>
#1      a    ili     2     5     3      0      4      1
#2      b   test     3     4     2      1      7      2
#3      c   goul     0     5     1      1      8      0
#4      d   houl     1     6     5      0      0      3
#5      f    fea     2     4     5      2     10      5
#6      k     ko     5     1     6      5      1      4
#7      l    lol     5     6     6      5      6      5

Answer 2

# library() fails loudly if dplyr is missing; require() would only return FALSE.
library(dplyr)

# Pair every sample with the reference means of its type. The join key is
# spelled out so dplyr does not have to guess it from shared column names.
inner_join(stats, data, by = "type") %>%
  # Squared Euclidean distance between the sample (a, b, c) and the means of
  # its type. The arithmetic is vectorised, so no rowwise() step is needed.
  mutate(diff = (a - mean_a)^2 + (b - mean_b)^2 + (c - mean_c)^2) %>%
  group_by(type) %>%
  # Keep the closest sample(s) within each type; ties are all retained.
  filter(diff == min(diff)) %>%
  select(Id, type, a, b, c)



#       Id  type     a     b     c
#   <fctr> <chr> <dbl> <dbl> <dbl>
# 1    ili     a     2     5     3
# 2   test     b     3     4     2
# 3   goul     c     0     5     1
# 4   houl     d     1     6     5
# 5    elf     f     1     5     8
# 6     ko     k     5     1     6
# 7    lol     l     5     6     6

Answer 3

我选择了平方（二次）距离作为度量，你可以根据需要将其换成任何你喜欢的距离度量：

# Row of `stats` that describes each sample's type. The lookup goes through
# match() because the rows of `stats` are not named by type: indexing with
# stats[data$type, ] would return NA (character types) or the wrong rows
# (factor codes).
ref <- match(data$type, stats$type)

# Squared Euclidean distance between each sample and the means of its type.
data$dist <- (data$a - stats$mean_a[ref])^2 +
  (data$b - stats$mean_b[ref])^2 +
  (data$c - stats$mean_c[ref])^2

# Single sample that is closest to its type means across the whole data set.
closest <- which.min(data$dist)
print(paste0("Closest is number ",closest, ": ",data[closest, "Id"] ))

# Most representative sample(s) of each type: rows whose distance equals the
# smallest distance observed within their own type. which() drops rows whose
# type has no entry in `stats` (their distance is NA).
closest_by_type <- data[which(data$dist == ave(data$dist, data$type, FUN = min)), ]
print(closest_by_type)

R：找到最接近平均值的观察值

3 个答案: