在字符串中查找连续标记

时间:2014-08-13 08:30:44

标签: c# xml regex parsing

<NNP>This is</NNP><NNP>a</NNP><NNP>test</NNP><VBD>application</VBD><NN>to </NN><NNP>find</NNP><NN>Consecutive</NN><NN>tags content</NN><IN>tags may vary.</IN> <NNP>using </NNP><NNP>C#</NNP>

期望的输出:

This is a test
Consecutive tags content
using C#

我想使用c#(regex)从字符串中找到连续的标签内容。

1 个答案:

答案 0 :(得分:0)

您可以使用Linq-To-XML。但是您需要首先修复具有多个根元素的无效xml:

// The raw fragment has several sibling top-level elements, which is not
// well-formed XML, so it is wrapped in one synthetic <root> element up front.
string xml = string.Format(
    "<root>{0}</root>",
    "<NNP>This is</NNP><NNP>a</NNP><NNP>test</NNP><VBD>application</VBD><NN>to </NN><NNP>find</NNP><NN>Consecutive</NN><NN>tags content</NN><IN>tags may vary.</IN> <NNP>using </NNP><NNP>C#</NNP>");

现在您可以使用XElement.Parse

// Parse the wrapped markup; the tagged tokens become the direct children of the root element.
var xelement = XElement.Parse(xml);

我使用了微软提供的连续分组实现(ChunkBy 扩展方法,代码附在本答案末尾)。

然后这个简洁的查询起作用:

// Split the child elements into runs of adjacent elements with the same tag name,
// keep only runs longer than one element, and list the longest runs first.
var groups =
    from run in xelement.Elements().ChunkBy(element => element.Name.LocalName)
    let size = run.Count()
    where size > 1
    orderby size descending
    select new
    {
        Tag = run.Key,
        // Text of every element in the run, separated by single spaces.
        Value = string.Join(" ", run.Select(element => element.Value)),
        Count = size
    };

快速测试:

// Print one line per run: tag name, number of elements, and the joined text.
foreach (var entry in groups)
{
    Console.WriteLine(
        "Tag: {0} Count: {1} Value: {2}",
        entry.Tag,
        entry.Count,
        entry.Value);
}

输出:

Tag: NNP Count: 3 Value: This is a test
Tag: NN Count: 2 Value: Consecutive tags content
Tag: NNP Count: 2 Value: using  C#

来自微软的连续分组扩展方法:

/// <summary>
/// Extension methods that split a sequence into chunks of <em>adjacent</em> elements
/// sharing the same key. Unlike <c>GroupBy</c>, a key that re-appears later in the
/// sequence starts a new chunk.
/// </summary>
public static class ConsecutiveGrouping
{
    /// <summary>
    /// Chunks <paramref name="source"/> by contiguous keys, comparing keys with the
    /// default equality comparer for <typeparamref name="TKey"/>.
    /// </summary>
    /// <param name="source">The sequence to split into chunks.</param>
    /// <param name="keySelector">Extracts the key of each element.</param>
    /// <returns>A lazily evaluated sequence of chunks, in source order.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="source"/> or <paramref name="keySelector"/> is null.
    /// </exception>
    public static IEnumerable<IGrouping<TKey, TSource>> ChunkBy<TSource, TKey>(this IEnumerable<TSource> source, Func<TSource, TKey> keySelector)
    {
        return source.ChunkBy(keySelector, EqualityComparer<TKey>.Default);
    }

    /// <summary>
    /// Chunks <paramref name="source"/> by contiguous keys, comparing keys with
    /// <paramref name="comparer"/>.
    /// </summary>
    /// <param name="source">The sequence to split into chunks.</param>
    /// <param name="keySelector">Extracts the key of each element.</param>
    /// <param name="comparer">
    /// Decides whether two keys belong to the same chunk. When null, the default
    /// equality comparer for <typeparamref name="TKey"/> is used.
    /// </param>
    /// <returns>A lazily evaluated sequence of chunks, in source order.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="source"/> or <paramref name="keySelector"/> is null.
    /// </exception>
    public static IEnumerable<IGrouping<TKey, TSource>> ChunkBy<TSource, TKey>(this IEnumerable<TSource> source, Func<TSource, TKey> keySelector, IEqualityComparer<TKey> comparer)
    {
        // Validate here, in a non-iterator method, so bad arguments fail at the call
        // site instead of on the first MoveNext of the returned sequence.
        if (source == null)
        {
            throw new ArgumentNullException("source");
        }

        if (keySelector == null)
        {
            throw new ArgumentNullException("keySelector");
        }

        return ChunkByIterator(source, keySelector, comparer ?? EqualityComparer<TKey>.Default);
    }

    // Iterator that does the actual chunking. REQUIRES: no argument is null.
    private static IEnumerable<IGrouping<TKey, TSource>> ChunkByIterator<TSource, TKey>(IEnumerable<TSource> source, Func<TSource, TKey> keySelector, IEqualityComparer<TKey> comparer)
    {
        // Flag to signal end of source sequence.
        const bool noMoreSourceElements = true;

        // One enumerator over the source is shared by this iterator and by every Chunk.
        // It cannot sit in a using block: a chunk handed to the caller may still pull
        // from it after this iterator has been abandoned. It is disposed as soon as
        // the source is exhausted instead.
        var enumerator = source.GetEnumerator();

        // Move to the first element in the source sequence.
        if (!enumerator.MoveNext())
        {
            enumerator.Dispose();
            yield break;
        }

        // On each pass the enumerator is positioned on the first element of the next
        // chunk. This loop corresponds to the caller's outer foreach.
        while (true)
        {
            // Key of the chunk that starts at the current element. Declared inside the
            // loop so each chunk's predicate captures its own key.
            var key = keySelector(enumerator.Current);

            // Make a new Chunk that initially holds only the current source element.
            var current = new Chunk<TKey, TSource>(key, enumerator, value => comparer.Equals(key, keySelector(value)));

            // Hand the chunk to the caller. Its remaining elements are pulled from the
            // source only when the caller enumerates the chunk (see Chunk.GetEnumerator).
            yield return current;

            // If the caller did not enumerate the chunk to its end, finish copying it
            // now so the shared enumerator is positioned on the next chunk's first
            // element. A true result means the source itself is exhausted.
            if (current.CopyAllChunkElements() == noMoreSourceElements)
            {
                yield break;
            }
        }
    }

    // A Chunk is a contiguous group of one or more source elements that have the same key. A Chunk
    // has a key and a linked list of ChunkItem objects, which hold copies of the source elements.
    private sealed class Chunk<TKey, TSource> : IGrouping<TKey, TSource>
    {
        // INVARIANT: DoneCopyingChunk == true ||
        //   (predicate != null && predicate(enumerator.Current) && current.Value == enumerator.Current)

        // Node of the singly linked list of elements copied into this chunk.
        class ChunkItem
        {
            public ChunkItem(TSource value)
            {
                Value = value;
            }
            public readonly TSource Value;
            public ChunkItem Next = null;
        }

        // The value that is used to determine matching elements.
        private readonly TKey key;

        // The shared source enumerator; set to null once this chunk is complete.
        private IEnumerator<TSource> enumerator;

        // Tests whether a source element belongs to this chunk; set to null once complete.
        private Func<TSource, bool> predicate;

        // First element of the chunk. A chunk is never empty.
        private readonly ChunkItem head;

        // Last copied element. Becomes null when the chunk is complete (see DoneCopyingChunk).
        private ChunkItem tail;

        // Set to true when the source enumerator reported the end of the source sequence.
        internal bool isLastSourceElement = false;

        // Private object for thread synchronization: every advance of the shared
        // enumerator made through this chunk happens while holding this lock.
        private readonly object m_Lock;

        // REQUIRES: enumerator != null && predicate != null, and enumerator is positioned
        // on the first element of this chunk.
        public Chunk(TKey key, IEnumerator<TSource> enumerator, Func<TSource, bool> predicate)
        {
            this.key = key;
            this.enumerator = enumerator;
            this.predicate = predicate;

            // A Chunk always contains at least one element.
            head = new ChunkItem(enumerator.Current);

            // The end and beginning are the same until the list contains > 1 elements.
            tail = head;

            m_Lock = new object();
        }

        // True when every element of the chunk has been copied into the linked list, i.e. the
        // source enumerator is either at its end or on an element with a different key.
        // CopyNextChunkElement sets tail to null in exactly those two cases.
        private bool DoneCopyingChunk { get { return tail == null; } }

        // Tries to append the next source element to this chunk.
        // REQUIRES: !DoneCopyingChunk && the caller holds m_Lock.
        private void CopyNextChunkElement()
        {
            // Try to advance the iterator on the source sequence.
            // If MoveNext returns false we are at the end, and isLastSourceElement is set to true.
            isLastSourceElement = !enumerator.MoveNext();

            if (isLastSourceElement)
            {
                // Source exhausted: nothing will read from the enumerator again,
                // so release it now, then drop our references.
                enumerator.Dispose();
                enumerator = null;
                predicate = null;
            }
            else if (!predicate(enumerator.Current))
            {
                // The current element starts the next chunk. Drop our references; the
                // outer iterator still holds the enumerator and builds the next Chunk from it.
                enumerator = null;
                predicate = null;
            }
            else
            {
                tail.Next = new ChunkItem(enumerator.Current);
            }

            // tail.Next is still null when no element was appended, which marks the
            // chunk as complete (see DoneCopyingChunk).
            tail = tail.Next;
        }

        // Called by the outer iterator before it moves on to the next chunk. Copies any
        // elements of this chunk that have not been copied yet.
        // Returns true if the source sequence is exhausted, false if another chunk follows.
        internal bool CopyAllChunkElements()
        {
            while (true)
            {
                lock (m_Lock)
                {
                    if (DoneCopyingChunk)
                    {
                        // If isLastSourceElement is false,
                        // it signals to the outer iterator
                        // to continue iterating.
                        return isLastSourceElement;
                    }
                    else
                    {
                        CopyNextChunkElement();
                    }
                }
            }
        }

        // The key shared by every element of this chunk.
        public TKey Key { get { return key; } }

        // Invoked by the caller's inner foreach. Elements already copied are replayed
        // from the linked list; the next source element is pulled only after the caller
        // has consumed the last element copied so far.
        public IEnumerator<TSource> GetEnumerator()
        {
            // Specify the initial element to enumerate.
            ChunkItem current = head;

            // There should always be at least one ChunkItem in a Chunk.
            while (current != null)
            {
                // Yield the current item in the list.
                yield return current.Value;

                // Copy the next item from the source sequence,
                // if we are at the end of our local list.
                lock (m_Lock)
                {
                    if (current == tail)
                    {
                        CopyNextChunkElement();
                    }
                }

                // Move to the next ChunkItem in the list.
                current = current.Next;
            }
        }

        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}