I always run into problems with loops, so I'm asking here. Two data frames: one very large, one much smaller. Example versions of both below.
Dataframe 1
ID Value
1 apples
1 apples
1 bananas
1 grapes
1 mangoes
1 oranges
1 grapes
1 apples
1 grapes
2 apples
2 apples
2 passionfruits
2 bananas
2 apples
2 apples
2 passionfruits
2 grapes
2 mangoes
2 apples
3 apples
3 bananas
3 oranges
3 apples
3 grapes
3 grapes
3 passionfruits
3 passionfruits
3 oranges
4 apples
4 oranges
4 mangoes
4 bananas
4 grapes
4 grapes
4 grapes
4 apples
4 oranges
4 grapes
4 mangoes
4 mangoes
4 apples
4 oranges
5 passionfruits
5 apples
5 oranges
5 oranges
5 mangoes
5 grapes
5 apples
5 bananas
Dataframe 2
Value
apples
apples
bananas
grapes
mangoes
mangoes
grapes
apples
apples
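In R, the example frames can be rebuilt like this (the answers below refer to them as df1/df2, or Dataframe1/Dataframe2 in answer 1):

df1 <- data.frame(
  ID = rep(1:5, c(9, 10, 9, 14, 8)),
  Value = c("apples", "apples", "bananas", "grapes", "mangoes", "oranges",
            "grapes", "apples", "grapes",
            "apples", "apples", "passionfruits", "bananas", "apples",
            "apples", "passionfruits", "grapes", "mangoes", "apples",
            "apples", "bananas", "oranges", "apples", "grapes", "grapes",
            "passionfruits", "passionfruits", "oranges",
            "apples", "oranges", "mangoes", "bananas", "grapes", "grapes",
            "grapes", "apples", "oranges", "grapes", "mangoes", "mangoes",
            "apples", "oranges",
            "passionfruits", "apples", "oranges", "oranges", "mangoes",
            "grapes", "apples", "bananas"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  Value = c("apples", "apples", "bananas", "grapes", "mangoes",
            "mangoes", "grapes", "apples", "apples"),
  stringsAsFactors = FALSE
)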
The distinct IDs in Dataframe 1 are treated as sets. Dataframe 2 as a whole will match one of those sets approximately or exactly. I know there is plenty of code for filtering against the whole of Dataframe 2 at once, but that is not what I'm asking for. I'm asking for it to filter value by value, in sequence, with an additional condition: whether the previous value also matched.
So in this example, nothing happens on the first value, because all IDs have 'apples'. The second value = 'apples' with previous value = 'apples' filters out ID = 4, because it does not contain 'apples' twice in a row. Then, in the filtered Dataframe 1, we search for the third value, and so on. It should stop only when a single ID set remains in Dataframe 1, so in this case after the 3rd iteration (a loop sketch of this procedure follows the expected output below). The result should be
Dataframe 1
ID Value
1 apples
1 apples
1 bananas
1 grapes
1 mangoes
1 oranges
1 grapes
1 apples
1 grapes
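For reference, a literal base-R loop over the procedure described above could look like the sketch below (df1 and df2 are assumed to be the two frames shown; the stopping rule is one reading of the description):

# at step i, keep only the IDs whose values contain df2$Value[i]
# immediately preceded by df2$Value[i - 1]; stop once one ID remains
seq_filter <- function(df1, df2) {
  ids <- unique(df1$ID)
  for (i in 2:nrow(df2)) {
    keep <- sapply(ids, function(id) {
      v <- df1$Value[df1$ID == id]
      any(v[-1] == df2$Value[i] & v[-length(v)] == df2$Value[i - 1])
    })
    ids <- ids[keep]
    if (length(ids) <= 1) break
  }
  df1[df1$ID %in% ids, ]
}

seq_filter(df1, df2)  # returns the rows of ID 1 for the example data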
Answer 0 (score: 2)
A possible approach using data.table (adapted from my answer here):

# load packages
library(data.table)

# create a function which calculates the match-score with 'df2$Value'
maxscore <- function(x, y) {
  m <- mapply('==', shift(x, type = 'lead', n = 0:(length(y) - 1)), y)
  max(rowSums(m, na.rm = TRUE))
}

# calculate the match-score for each group
# and filter out the other groups
setDT(df1)[, score := maxscore(Value, df2$Value), by = ID
           ][score == max(score)
             ][, score := NULL][]

which gives:
    ID   Value
1: 1 apples
2: 1 apples
3: 1 bananas
4: 1 grapes
5: 1 mangoes
6: 1 oranges
7: 1 grapes
8: 1 apples
9: 1 grapes
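For intuition about what the shift/mapply combination inside maxscore does, a small illustration with shortened vectors:

library(data.table)
x <- c("apples", "apples", "bananas")
y <- c("apples", "bananas")
# a list holding x shifted ahead by 0 and by 1 positions
shift(x, type = 'lead', n = 0:1)
# comparing shifted copy k against y[k] builds a matrix whose rows are
# starting positions in x and whose columns are offsets into y
m <- mapply('==', shift(x, type = 'lead', n = 0:1), y)
rowSums(m, na.rm = TRUE)
# [1] 1 2 0   -> y matches in full starting at position 2 of x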
You can also use that function in a dplyr chain (but you will still need the data.table package for its shift function):
library(dplyr)
df1 %>%
group_by(ID) %>%
mutate(m = maxscore(Value, df2$Value)) %>%
ungroup() %>%
filter(m == max(m)) %>%
select(-m)
An alternative implementation of the maxscore function (inspired by @doscendo's answer here):
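A minimal sketch of such an alternative, assuming it pads both value vectors to a common length and counts position-wise matches (the same idea as the token matcher in answer 1 below):

maxscore2 <- function(x, y) {
  # pad the shorter vector with NA so the element-wise comparison lines up
  max_len <- max(length(x), length(y))
  length(x) <- max_len
  length(y) <- max_len
  # count positions where both vectors hold the same value
  sum(x == y, na.rm = TRUE)
}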
Answer 1 (score: 0)
We can collapse the Value entries for each ID into one string using a token separator (say #), then write a custom function that counts how many tokens match in sequence. Finally, we select the data for the ID that achieved the maximum number of matches.
library(dplyr)
# This function counts tokens, separated by `#`, that match by position
# matched_count("a#b#c", "a#e#c#d") returns 2 ('a' and 'c' match)
matched_count <- function(x, y){
x_v <- strsplit(x, split = "#")[[1]]
y_v <- strsplit(y, split = "#")[[1]]
max_len <- max(length(x_v), length(y_v))
length(x_v) <- max_len
length(y_v) <- max_len
sum(x_v==y_v,na.rm = TRUE)
}
Dataframe1 %>% group_by(ID) %>%
mutate(CompStr = paste0(Value, collapse="#")) %>% #collapse values for ID
mutate(CompStrdf2 = paste0(Dataframe2$Value, collapse="#")) %>%
mutate(max_match = matched_count(CompStr, CompStrdf2)) %>%
ungroup() %>%
filter(max_match == max(max_match)) %>%
select(ID, Value)
# ID Value
# <int> <chr>
# 1 1 apples
# 2 1 apples
# 3 1 bananas
# 4 1 grapes
# 5 1 mangoes
# 6 1 oranges
# 7 1 grapes
# 8 1 apples
# 9 1 grapes
Answer 2 (score: 0)
I suggest converting the Values in each group into a single string and comparing their string edit distances. adist computes the approximate string distance between character vectors; that distance is a generalized Levenshtein (edit) distance, the minimal possibly weighted number of insertions, deletions and substitutions needed to transform one string into another.
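For intuition, the classic Levenshtein example:

adist("kitten", "sitting")
#      [,1]
# [1,]    3   (substitute k->s and e->i, insert g)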
# collapse each vector into one string and return the scalar edit distance
string_edit_dist <- function(vec1, vec2) {
  c(adist(paste0(vec1, collapse=""), paste0(vec2, collapse="")))
}
# distance of every ID's values to df2$Value; which.min gives the position
# among the unique IDs, which works as an ID here because they are 1..5
ind <- which.min(sapply(seq_along(unique(df1$ID)), function(i) string_edit_dist(df1$Value[df1$ID==i], df2$Value)))
df1[df1$ID==ind, ]
# ID Value
# 1 1 apples
# 2 1 apples
# 3 1 bananas
# 4 1 grapes
# 5 1 mangoes
# 6 1 oranges
# 7 1 grapes
# 8 1 apples
# 9 1 grapes
Here are the string edit distances for each group:
sapply(seq_along(unique(df1$ID)), function(i) string_edit_dist(df1$Value[df1$ID==i], df2$Value))
# 7 35 45 46 27