统计一个字节列表/数组在另一个字节列表/数组中出现的次数

时间:2011-06-20 00:52:24

标签: c# .net vb.net compression

我想统计一个字节序列在另一个字节序列中出现的次数。但是,已经被计入某次匹配的字节不能再被重复使用。例如,给定字符串
k.k.k.k.k.k.,假设要查找的字节序列为 k.k,那么应该只找到 3 次而不是 5 次,因为匹配应当被划分为 [k.k].[k.k].[k.k].,而不是像 [k.[k].[k].[k].[k].k] 那样相互重叠(即每次只是向右移动 2 个字节)。

理想情况下,我是想借此了解压缩字典或运行时编码大致是什么样子。所以目标是把

k.k.k.k.k.k. 缩减为只有 2 个部分,因为 (k.k.k.) 是所能得到的最大、最好的符号。

以下是我目前的源代码:

using System;
using System.Collections.Generic;
using System.Collections;
using System.Linq;
using System.Text;
using System.IO;


    /// <summary>
    /// Console experiment that looks for repeated byte patterns in a file and removes the
    /// repeats from a working buffer, plus helpers for counting matches and printing bytes
    /// as bit strings.
    /// </summary>
    static class Compression
    {
        /// <summary>
        /// Reads "ok.txt", uses windows of the data as candidate patterns (longest windows
        /// first) and records statistics for every candidate.
        /// </summary>
        /// <remarks>
        /// For each candidate the working buffer is scanned left to right. The first
        /// occurrence is kept; every later occurrence is removed from the buffer, so the
        /// buffer shrinks during the run and later candidates are taken from the reduced data.
        /// </remarks>
        /// <param name="args">Command-line arguments (not used).</param>
        /// <returns>Always 0.</returns>
        static int Main(string[] args)
        {
            List<byte> bytes = File.ReadAllBytes("ok.txt").ToList();

            // One entry per candidate: { match count, pattern length, score }.
            List<List<int>> list = new List<List<int>>();

            // Starting number of bytes - this can be changed manually.
            int startingNumBytes = bytes.Count;
            for (int i = startingNumBytes; i > 0; i--)
            {
                Console.WriteLine("i: " + i);

                // NOTE(review): the strict '<' means the window that ends exactly at the end
                // of the buffer is never used as a candidate (the full-length pass therefore
                // tries nothing) - confirm whether that is intended.
                for (int ii = 0; ii < bytes.Count - i; ii++)
                {
                    // Log the inner counter (the original printed the outer one here).
                    Console.WriteLine("ii: " + ii);

                    // Fresh copy of the candidate so later removals cannot alter it.
                    List<byte> pattern = bytes.GetRange(ii, i);

                    DisplayBinary(bytes, "red");
                    DisplayBinary(pattern, "green");

                    int matches = 0;
                    for (int position = 0; position < bytes.Count; position++)
                    {
                        // Too few bytes left for a full pattern; positions further right
                        // have even fewer, so the scan is finished.
                        if (pattern.Count > bytes.Count - position)
                        {
                            break;
                        }

                        if (!MatchesAt(bytes, position, pattern))
                        {
                            continue;
                        }

                        matches++;
                        Console.WriteLine("Matches: " + matches + " Orig Count: " + bytes.Count + " POS: " + position);
                        if (matches > 1)
                        {
                            // Drop the repeated occurrence; the rest of the buffer shifts left.
                            bytes.RemoveRange(position, pattern.Count);
                            DisplayBinary(bytes, "red");
                            Console.WriteLine(pattern.Count + " Bytes removed.");

                            // Re-examine the same index after the loop's increment, because
                            // new bytes have moved into it.
                            position = position - 1;
                        }
                    }

                    // Some sort of calculation to determine how good the symbol was.
                    List<int> sublist = new List<int>
                    {
                        matches,
                        pattern.Count,
                        bytes.Count - ((matches * pattern.Count) - matches)
                    };
                    list.Add(sublist);
                }
            }

            Display(list);
            Console.Read();
            return 0;
        }

        /// <summary>
        /// Tells whether <paramref name="pattern"/> occurs in <paramref name="data"/> starting
        /// at <paramref name="offset"/>. The caller guarantees the pattern fits.
        /// </summary>
        private static bool MatchesAt(List<byte> data, int offset, List<byte> pattern)
        {
            for (int k = 0; k < pattern.Count; k++)
            {
                if (data[offset + k] != pattern[k])
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Writes the bytes to the console as 8-bit binary strings, eight bytes per line.
        /// </summary>
        /// <param name="bytes">Bytes to print.</param>
        /// <param name="color">
        /// "green" or "red" selects that foreground colour; any other value leaves the
        /// current colour unchanged. The colour is reset after printing.
        /// </param>
        static void DisplayBinary(List<byte> bytes, string color="white")
        {
            switch (color)
            {
                case "green":
                    Console.ForegroundColor = ConsoleColor.Green;
                    break;
                case "red":
                    Console.ForegroundColor = ConsoleColor.Red;
                    break;
                default:
                    break;
            }

            for (int i = 0; i < bytes.Count; i++)
            {
                // Start a new line before every group of eight bytes (including the first).
                if (i % 8 == 0)
                {
                    Console.WriteLine();
                }
                Console.Write(GetIntBinaryString(bytes[i]) + " ");
            }
            Console.WriteLine();
            Console.ResetColor();
        }

        /// <summary>
        /// Formats the low eight bits of <paramref name="n"/> as an 8-character string of
        /// '0' and '1', most significant bit first.
        /// </summary>
        static string GetIntBinaryString(int n)
        {
            // Mask to one byte, then left-pad because Convert.ToString drops leading zeros.
            return Convert.ToString(n & 0xFF, 2).PadLeft(8, '0');
        }

        /// <summary>
        /// Prints every sublist on its own line (values right-aligned in 4 columns) followed
        /// by the total number of values across all sublists.
        /// </summary>
        static void Display(List<List<int>> list)
        {
            Console.WriteLine("Elements:");
            foreach (var sublist in list)
            {
                foreach (var value in sublist)
                {
                    Console.Write("{0,4}", value);
                }
                Console.WriteLine();
            }

            int count = list.Sum(sublist => sublist.Count);
            Console.WriteLine("Count:");
            Console.WriteLine(count);
        }

        /// <summary>
        /// Counts the non-overlapping occurrences of <paramref name="pattern"/> in
        /// <paramref name="bytes"/>, scanning left to right.
        /// </summary>
        /// <param name="pattern">Byte sequence to look for.</param>
        /// <param name="bytes">Data to search.</param>
        /// <returns>
        /// Number of matches; 0 when the pattern is empty or longer than the data.
        /// </returns>
        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
        public static int SearchBytePattern(byte[] pattern, byte[] bytes)
        {
            if (pattern == null)
            {
                throw new ArgumentNullException("pattern");
            }
            if (bytes == null)
            {
                throw new ArgumentNullException("bytes");
            }
            if (pattern.Length == 0)
            {
                return 0;
            }

            int matches = 0;

            // Last index at which a full pattern still fits. The bound is inclusive so that
            // a match ending exactly at the end of the data is counted.
            int lastStart = bytes.Length - pattern.Length;
            for (int i = 0; i <= lastStart; i++)
            {
                int j = 0;
                while (j < pattern.Length && bytes[i + j] == pattern[j])
                {
                    j++;
                }

                if (j == pattern.Length)
                {
                    matches++;
                    // Skip the matched bytes so they are not reused; the loop's i++ supplies
                    // the final step.
                    i += pattern.Length - 1;
                }
            }
            return matches;
        }
    }

文件的非二进制(文本)内容请参见上文,下面是它的二进制数据: 011010110010111001101011001011100110101100101110011010110010111001101011001011100110101100101110 我希望处理之后的结果比最初的数据更小。

3 个答案:

答案 0 :(得分:6)

/// <summary>
/// Counts the non-overlapping occurrences of <paramref name="pattern"/> in
/// <paramref name="target"/>.
/// </summary>
/// <remarks>
/// Both arrays are rendered as hyphen-separated hex text via BitConverter.ToString and
/// the pattern text is used as a regular expression. That text contains only hex digits
/// and hyphens, which are plain literals in a regex, so no escaping is needed.
/// </remarks>
/// <param name="target">Data to search.</param>
/// <param name="pattern">Byte sequence to look for.</param>
/// <returns>
/// Number of matches; 0 when the pattern is empty or longer than the target.
/// </returns>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
private static int CountOccurences(byte[] target, byte[] pattern)
{
    if (target == null)
    {
        throw new ArgumentNullException("target");
    }
    if (pattern == null)
    {
        throw new ArgumentNullException("pattern");
    }

    // An empty pattern would become an empty regex, which matches at every character
    // position of the hex text and yields a count unrelated to the data.
    if (pattern.Length == 0 || pattern.Length > target.Length)
    {
        return 0;
    }

    var targetString = BitConverter.ToString(target);
    var patternString = BitConverter.ToString(pattern);
    return new Regex(patternString).Matches(targetString).Count;
}

答案 1 :(得分:2)

使用此解决方案,您可以访问匹配的各个索引(在枚举时),或者您可以在结果上调用Count()以查看有多少匹配:

/// <summary>
/// Lazily yields the start index of every occurrence of <paramref name="pattern"/> in
/// <paramref name="sequence"/>, in ascending order.
/// </summary>
/// <typeparam name="T">Element type; elements are compared with the default equality comparer.</typeparam>
/// <param name="pattern">Elements to look for.</param>
/// <param name="sequence">Elements to search.</param>
/// <param name="overlap">
/// When true the search resumes one element after each match, so matches may overlap;
/// when false it resumes after the end of the match.
/// </param>
/// <returns>
/// The matching start indices. An empty pattern matches at every index from 0 to
/// <c>sequence.Length</c> inclusive, regardless of <paramref name="overlap"/>.
/// </returns>
public static IEnumerable<int> Find<T>(T[] pattern, T[] sequence, bool overlap)
{
    var comparer = EqualityComparer<T>.Default;

    // Distance to advance after a match. An empty pattern must still advance by one,
    // otherwise a non-overlapping search would yield index 0 forever.
    int matchStep = (overlap || pattern.Length == 0) ? 1 : pattern.Length;

    // Last index at which a full pattern still fits.
    int lastStart = sequence.Length - pattern.Length;

    int i = 0;
    while (i <= lastStart)
    {
        // Compare in place rather than re-enumerating the sequence from its start for
        // every candidate position.
        bool isMatch = true;
        for (int j = 0; j < pattern.Length; j++)
        {
            if (!comparer.Equals(pattern[j], sequence[i + j]))
            {
                isMatch = false;
                break;
            }
        }

        if (isMatch)
        {
            yield return i;
            i += matchStep;
        }
        else
        {
            i++;
        }
    }
}

使用overlap: false来解决您的问题,或overlap: true查看重叠的匹配项(如果您有兴趣的话)。

我还有其他两种方法,它们的API略有不同(以及更好的性能)here,包括一个直接处理字节流的方法。

答案 2 :(得分:1)

一个简单粗糙、不使用正则表达式的实现。虽然我不确定它是否符合问题的本意,但它应该相对较快。我打算把它和正则表达式版本做一些计时对比测试,以确定两者的相对速度:

    /// <summary>
    /// Counts the non-overlapping occurrences of <paramref name="TestPattern"/> in
    /// <paramref name="TestString"/>, scanning left to right.
    /// </summary>
    /// <param name="TestString">Text to search; an empty string yields 0.</param>
    /// <param name="TestPattern">Text to look for; must not be empty.</param>
    /// <returns>Number of non-overlapping matches.</returns>
    /// <exception cref="ApplicationException"><paramref name="TestPattern"/> is empty.</exception>
    private int CountOccurrences(string TestString, string TestPattern)
    {
        if (TestPattern.Length == 0)
        {
            throw new ApplicationException("CountOccurrences: Unable to process because TestPattern has zero length.");
        }

        int patternCount = 0;

        // Ordinal comparison matches exact character codes. The culture-sensitive default
        // may skip ignorable characters or treat some sequences as equivalent, which
        // produces wrong counts for byte-like data.
        int searchIndex = TestString.IndexOf(TestPattern, StringComparison.Ordinal);
        while (searchIndex >= 0)
        {
            ++patternCount;

            // Resume after the end of this match so its characters are not reused. An
            // ordinal match always ends inside the string, so the start index stays valid.
            searchIndex = TestString.IndexOf(TestPattern, searchIndex + TestPattern.Length, StringComparison.Ordinal);
        }

        return patternCount;
    }

    /// <summary>
    /// Click handler that runs CountOccurrences on three sample inputs and writes each
    /// result to the console on its own line.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    private void btnTest_Click(object sender, EventArgs e)
    {
        const string sampleText = "k.k.k.k.k.k.k.k.k.k.k.k";
        const string samplePattern = "k.k";

        // Each row is { text, pattern }; expected results are 6, 6 and 5 respectively.
        string[][] cases =
        {
            new[] { sampleText, samplePattern },          // outputs 6
            new[] { sampleText + ".k", samplePattern },   // still 6
            new[] { sampleText, samplePattern + "." },    // only 5
        };

        foreach (string[] testCase in cases)
        {
            int occurrences = CountOccurrences(testCase[0], testCase[1]);
            Console.WriteLine(occurrences);
        }
    }