Question

请考虑以下示例数据框：

> df
   id name time
1   1    b   10
2   1    b   12
3   1    a    0
4   2    a    5
5   2    b   11
6   2    a    9
7   2    b    7
8   1    a   15
9   2    b    1
10  1    a    3

# Sample data: ten observations, each giving the event `name` ("a" or "b")
# recorded at an integer `time` for one of two ids.
df <- data.frame(
  id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L),
  name = c("b", "b", "a", "a", "b", "a", "b", "a", "b", "a"),
  time = c(10L, 12L, 0L, 5L, 11L, 9L, 7L, 15L, 1L, 3L),
  stringsAsFactors = FALSE
)

我需要针对每个id，按照“time”列的先后顺序，找出并记录所有符合序列 seq <- c("a","b") 的行，即“a”紧接在“b”之前，“a”和“b”之间不允许出现其他名称。实际的序列长度至少为5。样本数据的预期结果是：

   a  b
1  3 10
2  5  7
3  9 11

有一个类似的问题：Finding rows in R dataframe where a column value follows a sequence。但是，在我的情况下，我不清楚如何处理“id”列。有没有办法使用“dplyr”解决这个问题？

Answer 1

library(dplyr)
library(tidyr)

# Find every "a" row that is immediately followed (in time order, within the
# same id) by a "b" row, and return one row per a/b pair in wide format.
df %>%
  # sort by id and time so lead()/lag() see each id's rows in time order
  arrange(id, time) %>%
  group_by(id) %>%
  mutate(
    # TRUE on an "a" row whose next row is "b". %in% is used instead of ==
    # so that the last row of a group (where lead() is NA) and missing names
    # yield FALSE rather than NA; an NA here would propagate through
    # cumsum() and turn every later pair id into NA.
    ab = name %in% "a" & lead(name) %in% "b",
    # running count of pairs within the id; the "b" row carries the same
    # count as the "a" row just before it, so g labels each pair
    g = cumsum(ab)
  ) %>%
  # keep the "a" row of each pair and the "b" row right after it
  filter(ab | lag(ab, default = FALSE)) %>%
  # one row per (id, g), with the times of "a" and "b" as columns;
  # pivot_wider() replaces the superseded spread()
  select(-ab) %>%
  pivot_wider(names_from = name, values_from = time)


#Source: local data frame [3 x 4]
#Groups: id [2]

#     id     g     a     b
#* <int> <int> <int> <int>
#1     1     1     3    10
#2     2     1     5     7
#3     2     2     9    11

如果序列的长度大于2，那么您将需要检查多个滞后，其中一个选项是使用来自data.table的shift函数（它接受向量作为滞后/超前步数）并与Reduce结合使用，假设我们需要检查模式abb：

（原答案此处的示例代码在本页面中缺失。）

Answer 2

您可以在filter中使用ifelse，配合lag和lead，然后使用tidyr::spread重塑为宽格式：

library(tidyverse)

df %>% arrange(id, time) %>% group_by(id) %>%
    filter(ifelse(name == 'b',     # if name is b...
                  lag(name) == 'a',    # is the previous name a?
                  lead(name) == 'b')) %>%    # else if name is not b, is next name b?
    ungroup() %>% mutate(i = rep(seq(n() / 2), each = 2)) %>%    # create indices to spread by
    spread(name, time) %>% select(a, b)    # spread to wide and clean up

## # A tibble: 3 × 2
##       a     b
## * <int> <int>
## 1     3    10
## 2     5     7
## 3     9    11

根据下面的评论，这里有一个使用gregexpr来查找匹配模式的第一个索引的版本，虽然更复杂，但更容易扩展到更长的模式，如"aabb"：

df %>% group_by(pattern = 'aabb', id) %>%    # add pattern as column, group
    arrange(time) %>%
    # collapse each group to a string for name and a list column for time
    summarise(name = paste(name, collapse = ''), time = list(time)) %>%
    # group and add list-column of start indices for each match
    rowwise() %>% mutate(i = gregexpr(pattern, name)) %>%
    unnest(i, .drop = FALSE) %>%    # expand, keeping other list columns
    filter(i != -1) %>%    # chop out rows with no match from gregexpr
    rowwise() %>%    # regroup
    # subset with sequence from index through pattern length
    mutate(time = list(time[i + 0:(nchar(pattern) - 1)]),
           pattern = strsplit(pattern, '')) %>%    # expand pattern to list column
    rownames_to_column('match') %>%    # add rownames as match index column
    unnest(pattern, time) %>%    # expand matches in parallel
    # paste sequence onto each letter (important for spreading if repeated letters)
    group_by(match) %>% mutate(pattern = paste0(pattern, seq(n()))) %>%
    spread(pattern, time)    # spread to wide form

## Source: local data frame [1 x 8]
## Groups: match [1]
##
##   match    id  name     i    a1    a2    b3    b4
## * <chr> <int> <chr> <int> <int> <int> <int> <int>
## 1     1     1 aabba     1     0     3    10    12

请注意，如果模式不是按字母顺序排列，则结果列不会按其索引排序。但是，由于保留了索引，因此您可以使用类似 select(1:4, parse_number(names(.)[-1:-4]) + 4) 的内容进行排序。

Answer 3

这有点令人费解，但滚动连接（rolling join）怎么样？

library(data.table)
setorder(setDT(df), id, time)

df[ name == "b" ][
  df[, if(name == "a") .(time = last(time)), by=.(id, name, r = rleid(id,name))],
  on = .(id, time),
  roll = -Inf,
  nomatch = 0,
  .(a = i.time, b = x.time)
]

   a  b
1: 3 10
2: 5  7
3: 9 11

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;

namespace ConsoleApplication1
{
    /// <summary>
    /// Copies every &lt;result&gt; element from the input XML file into a new
    /// &lt;resultset&gt; document, renaming three child elements on the way.
    /// Elements are read one at a time from the reader, so only a single
    /// &lt;result&gt; is materialised as an XElement at once.
    /// </summary>
    class Program
    {
        const string INPUT_FILENAME = @"c:\temp\test1.xml";
        const string OUTPUT_FILENAME = @"c:\temp\test2.xml";

        /// <summary>
        /// Reads INPUT_FILENAME, rewrites each &lt;result&gt; element and writes
        /// the results to OUTPUT_FILENAME.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // using-blocks dispose the reader and writer even when an exception
            // is thrown; disposing the writer also flushes and closes it.
            using (XmlReader reader = XmlReader.Create(INPUT_FILENAME))
            using (XmlWriter writer = XmlWriter.Create(OUTPUT_FILENAME))
            {
                writer.WriteStartElement("resultset");
                while (!reader.EOF)
                {
                    // XElement.ReadFrom leaves the reader on the node after the
                    // element it consumed, which may already be the next
                    // <result>; only skip ahead when it is not.
                    if (reader.Name != "result")
                    {
                        reader.ReadToFollowing("result");
                    }
                    if (!reader.EOF)
                    {
                        XElement result = (XElement)XElement.ReadFrom(reader);
                        RenameChild(result, "new_categoria", "org_category");
                        RenameChild(result, "new_name", "org_name");
                        RenameChild(result, "new_tipodecampanaid", "org_campaignid");

                        writer.WriteRaw(result.ToString());
                    }
                }
                writer.WriteEndElement();
            }
        }

        /// <summary>
        /// Renames the first child of <paramref name="parent"/> called
        /// <paramref name="oldName"/> to <paramref name="newName"/>. Does
        /// nothing when no such child exists, instead of throwing a
        /// NullReferenceException.
        /// </summary>
        private static void RenameChild(XElement parent, string oldName, string newName)
        {
            XElement child = parent.Element(oldName);
            if (child != null)
            {
                child.Name = newName;
            }
        }
    }
}

R - 按数据框

3 个答案: