Question

在访问网站时，我正在尝试从一系列其他列（d1，d2，d3，d4，d5）中找到与lastseen中的值最接近的匹配项，以创建一个新列nextvisit，其值取自d1，d2，d3，d4或d5，即比lastseen大的下一个值（也就是最后一次见到某人之后的下一次访问）。

可复制的示例：

indiv lastseen d1  d2  d3  d4   d5
A     2         2   4   5   8   10
B     5         2   3   5   7    9
C     9         1   6   9  11   15

所以我要寻找的答案是：

indiv lastseen d1  d2  d3  d4   d5  nextvisit
A     2         2   4   5   8   10  4
B     5         2   3   5   7    9  7
C     9         1   6   9  11   15  11

也就是说，对于个体A，d1至d5各列中大于lastseen（2）的下一个数字是4。

我尝试使用tidyr和dplyr，但无法有效地找到下一个最大的匹配项。

谢谢

Answer 1

请考虑df是您的data.frame。这是完整的R base方法

# Base-R approach: for each row, pick the smallest visit value that is strictly
# greater than `lastseen`. `df` has `indiv` in column 1, `lastseen` in column 2
# and the visit columns (d1, d2, ...) after that.
visits <- as.matrix(df[, -c(1, 2)])
# The `lastseen` vector is recycled down the columns, so every visit is
# compared with the `lastseen` value of its own row.
later <- visits > df[, 2]
# Mask everything that is not a later visit (including NA comparisons) so that
# only real candidates take part in min(). Masking with NA instead of
# multiplying by 0 keeps genuine zero values usable.
visits[is.na(later) | !later] <- NA
df$nextvisit <- apply(visits, 1, function(x) {
  # NA when the row has no later visit; min() on an empty set would return Inf
  # together with a warning.
  if (all(is.na(x))) NA else min(x, na.rm = TRUE)
})
df
#   indiv lastseen d1 d2 d3 d4 d5 nextvisit
# 1     A        2  2  4  5  8 10         4
# 2     B        5  2  3  5  7  9         7
# 3     C        9  1  6  9 11 15        11

Answer 2

另一个base R选项

数据

# Next visit = the value in the d-column immediately to the right of the one
# that equals `lastseen`. This relies on the d-columns being in ascending order
# within each row and on `lastseen` appearing among them.
visits <- as.matrix(DF[-c(1, 2)])
# The `lastseen` vector is recycled down the columns, so each column is
# compared with the `lastseen` value of its own row.
idx <- which(visits == DF$lastseen, arr.ind = TRUE)
idx[, "col"] <- idx[, "col"] + 1L # lastseen + 1 = next visit (in terms of column positions)
# A match in the last d-column has no column to its right: drop it.
idx <- idx[idx[, "col"] <= ncol(visits), , drop = FALSE]
# which() lists matches column by column, not row by row, so the values are
# assigned through the row index instead of relying on the order of `idx`.
# Rows without a usable match stay NA.
DF$nextvisit <- NA
DF$nextvisit[idx[, "row"]] <- visits[idx]
DF
#  indiv lastseen d1 d2 d3 d4 d5 nextvisit
#1     A        2  2  4  5  8 10         4
#2     B        5  2  3  5  7  9         7
#3     C        9  1  6  9 11 15        11

Answer 3

使用tidyverse：用gather将'd1'至'd5'列转换为'long'格式，按'indiv'分组，计算'val'与'lastseen'之差，用slice取出差值为最小正值的行，select感兴趣的列，并与原始数据集进行联接

library(tidyverse)
# Reshape d1..d5 to long form, keep for every individual the visit with the
# smallest positive gap to `lastseen`, then join that value back onto the
# original wide data and restore the original column order.
df1 %>%
  gather(key, val, d1:d5) %>%
  group_by(indiv) %>%
  # only visits strictly after `lastseen` are candidates
  filter(val > lastseen) %>%
  # first row with the smallest gap within each individual
  slice(which.min(val - lastseen)) %>%
  select(indiv, val) %>%
  right_join(df1) %>%
  select(names(df1), everything())
# A tibble: 3 x 8
# Groups:   indiv [3]
#  indiv lastseen    d1    d2    d3    d4    d5   val
#  <chr>    <int> <int> <int> <int> <int> <int> <int>
#1 A            2     2     4     5     8    10     4
#2 B            5     2     3     5     7     9     7
#3 C            9     1     6     9    11    15    11

另一个选择是使用base R中的max.col。先求出各'd'列与'lastseen'列之间的差（'m1'），将小于或等于0的值替换为一个非常大的数，然后对取负后的'm1'使用max.col，得到每一行最大值所在的列索引（反向逻辑：取负之后的最大值对应原来的最小正差），再与行索引cbind，从对应的'd'列中提取值。

# Base-R alternative built on max.col().
# Gap between every visit column (columns 3 to 7) and `lastseen`; the vector is
# recycled down the columns, so each row is compared with its own `lastseen`.
visits <- as.matrix(df1[3:7])
m1 <- visits - df1$lastseen
# Visits that are not later than `lastseen` must never win. Inf is used as the
# mask because a finite sentinel such as 999 could be exceeded by a real gap.
m1[m1 <= 0] <- Inf
# Negating turns "smallest positive gap" into "largest value"; "first" breaks
# ties in favour of the leftmost column.
nxt <- max.col(-m1, ties.method = "first")
# When every gap in a row is masked there is no later visit: report NA instead
# of silently returning the value of the first column.
nxt[rowSums(is.finite(m1)) == 0] <- NA
df1$val <- visits[cbind(seq_len(nrow(df1)), nxt)]
df1$val
#[1]  4  7 11

数据

# Example data: one row per individual with the `lastseen` value and the
# visit columns d1..d5 (all integers).
df1 <- data.frame(
  indiv = c("A", "B", "C"),
  lastseen = c(2L, 5L, 9L),
  d1 = c(2L, 2L, 1L),
  d2 = c(4L, 3L, 6L),
  d3 = c(5L, 5L, 9L),
  d4 = c(8L, 7L, 11L),
  d5 = c(10L, 9L, 15L),
  stringsAsFactors = FALSE
)

Answer 4

# Example data from the question, parsed from an inline whitespace-separated
# table. `header = TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
df <- read.table(text = "
indiv lastseen d1  d2  d3  d4   d5
                A     2         2   4   5   8   10
                B     5         2   3   5   7    9
                C     9         1   6   9  11   15
                ", header = TRUE)

library(tidyverse)

# Nest the visit columns per (indiv, lastseen) pair, compute the smallest visit
# value above `lastseen` for each nested table, then flatten everything again.
df %>%
  group_by(indiv, lastseen) %>%
  nest() %>%
  mutate(
    nextvisit = map2(lastseen, data, function(seen, visits) {
      # flatten the nested one-row table into a plain vector of visit values
      later <- unlist(visits)
      # minimum value higher than the corresponding lastseen value
      min(later[later > seen])
    })
  ) %>%
  unnest()

# # A tibble: 3 x 8
#   indiv lastseen nextvisit    d1    d2    d3    d4    d5
#   <fct>    <int>     <int> <int> <int> <int> <int> <int>
# 1 A            2         4     2     4     5     8    10
# 2 B            5         7     2     3     5     7     9
# 3 C            9        11     1     6     9    11    15

Answer 5

另一种基本的R方式：

# For each row, the smallest visit value strictly greater than `lastseen`.
# apply() sees every row of df[, -1] as c(lastseen, d1, d2, ...).
# NOTE(review): the column is called `lastvisit` although it holds the *next*
# visit; the name is kept so existing references keep working.
df$lastvisit <- apply(df[, -1], 1, function(x) {
  seen <- head(x, 1)
  # every column after `lastseen`, rather than a hard-coded count of 5
  visits <- tail(x, -1)
  later <- visits[visits > seen]
  # NA when there is no later visit; min() on an empty set would return Inf
  # together with a warning
  if (length(later) > 0) min(later) else NA
})

或更不易读，但更简短：

# Compact variant: x[1] is `lastseen`, x[-1] are the visit columns.
# Yields NA (instead of Inf plus a warning) when a row has no later visit.
df$lastvisit <- apply(df[, -1], 1, function(x) {
  later <- x[-1][x[-1] > x[1]]
  if (length(later) > 0) min(later) else NA
})

Answer 6

以下是结合了data.table和Find()函数的解决方案：

library(data.table)
# Convert df1 to a data.table and add `nextvisit` with `:=`: within each
# `indiv` group, Find() walks the .SD columns (d1..d5, selected via .SDcols)
# from left to right and returns the first one for which `x > lastseen` holds.
# NOTE(review): taking the *first* match only yields the next visit when
# d1..d5 are in ascending order within a row - confirm for real data.
setDT(df1)[, nextvisit := Find(function(x) x > lastseen, .SD), .SDcols = d1:d5, by = indiv]
# The empty [] makes the updated table print.
df1[]

   indiv lastseen d1 d2 d3 d4 d5 nextvisit
1:     A        2  2  4  5  8 10         4
2:     B        5  2  3  5  7  9         7
3:     C        9  1  6  9 11 15        11

Find()对每一行从左到右在.SD的各列上应用过滤函数function(x) x > lastseen，并返回满足条件的第一个元素。.SD所包含的列由.SDcols = d1:d5指定。

请注意，每行中的值都必须已经按从左到右的升序排序。如果不确定，可以将.SD替换为sort(.SD)。

Answer 7

tidyverse解决方案，已完全矢量化，此处未转换为矩阵：

library(tidyverse)
# Column-wise computation of the next visit:
# 1. for every column whose name starts with "d", keep the value where it
#    exceeds `lastseen` and use Inf otherwise (`.$lastseen` is forwarded to the
#    lambda as its second argument, .y);
# 2. pmin() across the resulting columns gives the row-wise minimum, i.e. the
#    smallest value above `lastseen` (Inf when a row has none);
# 3. that vector is appended to df1 as `nextvisit`.
df1 %>% 
  transmute_at(vars(starts_with("d")), ~ ifelse(.x>.y, .x, Inf), .$lastseen) %>%
  invoke(pmin,.) %>%
  bind_cols(df1,nextvisit=.)

#      indiv lastseen d1 d2 d3 d4 d5 nextvisit
#    1     A        2  2  4  5  8 10         4
#    2     B        5  2  3  5  7  9         7
#    3     C        9  1  6  9 11 15        11

多列中与另一列最匹配的查找值

7 个答案:

数据