在LINQ中使用分组会更快吗?这种做法可行吗?

时间:2014-06-06 19:34:06

标签: c# linq

我有一个LINQ数据表,如下所示:

numberColumn    Value
1               3
4               1
30              6
20              10
50              5

我还有一个范围列表,看起来像这样(可以有任意数量的范围)

Ranges
    range
        lowerRangeInclusive = 0
        upperRangeExclusive = 10
        average = null
    range
        lowerRangeInclusive = 10
        upperRangeExclusive = 40
        average = null
    range
        lowerRangeInclusive = 40
        upperRangeExclusive = 100
        average = null

我需要以快速的方式计算每个numberColumn范围的平均值(我的实际数据非常大。数十万行,100多个范围)。在上面的示例中,它应如下所示:

Ranges
    range
        lowerRangeInclusive = 0
        upperRangeExclusive = 10
        average = 2
    range
        lowerRangeInclusive = 10
        upperRangeExclusive = 40
        average = 8
    range
        lowerRangeInclusive = 40
        upperRangeExclusive = 100
        average = 5

目前我只有下面这样的代码,但它很慢。是这段代码完全没有优化,还是无论怎么写都会这么慢?

var table = GetTable();
foreach (var range in Ranges)
{
    // For each range, scan the whole table and keep the Value of every row
    // whose numberColumn lies in [lowerRangeInclusive, upperRangeExclusive).
    var valuesInRange = from row in table
                        where row.numberColumn >= range.lowerRangeInclusive &&
                              row.numberColumn < range.upperRangeExclusive
                        select row.Value;

    // Average of the selected values becomes the range's result.
    range.Average = valuesInRange.Average();
}

有更快的方法吗?

1 个答案:

答案 0 :(得分:0)

你没有说明“非常慢”具体是多慢。我的做法是把数据分到若干列表中,每个列表对应一个范围。对于100,000个项目和200个范围,在我的计算机上处理用时不到一秒钟。

LINQ写起来优雅而快捷,但它并不总是处理数据的最快方式。

如果这段代码不足以帮助您理解“binning”(分箱)的含义,可以参考直方图(Histogram)的相关介绍。

using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Linq;

/// <summary>
/// Demonstrates "binning": every row of a DataTable is placed into the list (bin)
/// of the first range that contains its numberColumn, and the average of the
/// Value column is then computed per bin.
/// </summary>
static class Module1
{

    const int NUMBERCOLUMNMAX = 1000;
    const int VALUEMAX = 100;
    const int SAMPLEDATASIZE = 100000; // how many rows to populate the DataTable with.
    const int NBINS = 200; // equivalent to quantity of ranges

    static Random rand = new Random();
    static DataTable dt;

    static List<Range> ranges;

    /// <summary>
    /// A half-open interval [LowerRangeInclusive, UpperRangeExclusive) together
    /// with the average computed for the rows that fall inside it.
    /// </summary>
    public class Range
    {
        public int LowerRangeInclusive { get; set; }
        public int UpperRangeExclusive { get; set; }
        public double Average { get; set; }
    }

    /// <summary>
    /// Builds a DataTable with two Int32 columns ("numberColumn" and "Value")
    /// and fills it with SAMPLEDATASIZE rows of random sample data.
    /// </summary>
    /// <returns>The populated table.</returns>
    public static DataTable GetData()
    {
        // create DataTable (local is named so that it does not shadow the static field dt)
        DataTable table = new DataTable();
        DataColumn dcNum = new DataColumn
        {
            ColumnName = "numberColumn",
            DataType = typeof(int)
        };
        DataColumn dcVal = new DataColumn
        {
            ColumnName = "Value",
            DataType = typeof(int)
        };
        table.Columns.Add(dcNum);
        table.Columns.Add(dcVal);

        // populate DataTable
        for (int i = 1; i <= SAMPLEDATASIZE; i++)
        {
            DataRow dr = table.NewRow();
            dr[0] = rand.Next(1, NUMBERCOLUMNMAX + 1);
            dr[1] = rand.Next(1, VALUEMAX + 1);
            table.Rows.Add(dr);
        }

        return table;

    }

    /// <summary>
    /// Creates NBINS contiguous ranges that together cover [0, NUMBERCOLUMNMAX).
    /// </summary>
    /// <returns>A new list of ranges ordered by lower bound.</returns>
    public static List<Range> GetRanges()
    {
        // Build into a local list; the caller decides where to store the result.
        int nRanges = NBINS;
        List<Range> result = new List<Range>(nRanges);

        for (int i = 0; i < nRanges; i++)
        {
            Range thisRange = new Range
            {
                LowerRangeInclusive = Convert.ToInt32(Math.Floor(Convert.ToDouble(NUMBERCOLUMNMAX) * i / nRanges)),
                UpperRangeExclusive = Convert.ToInt32(Math.Floor(Convert.ToDouble(NUMBERCOLUMNMAX) * (i + 1) / nRanges))
            };
            result.Add(thisRange);
        }

        return result;

    }

    /// <summary>
    /// Sets <see cref="Range.Average"/> on every range that receives at least one row.
    /// Each row is assigned to the first range (in list order) whose interval
    /// contains its "numberColumn" value; rows matching no range are ignored.
    /// </summary>
    /// <param name="ranges">Ranges to fill in; they are modified in place.</param>
    /// <param name="dt">Table with Int32-convertible "numberColumn" and "Value" columns.</param>
    public static void SetAverages(List<Range> ranges, DataTable dt)
    {
        int nRanges = ranges.Count;
        if (nRanges == 0)
        {
            // Nothing to fill in, so no row needs to be read.
            return;
        }

        // One list of values per range, index-aligned with the ranges list.
        List<int>[] bins = new List<int>[nRanges];

        for (int i = 0; i < nRanges; i++)
        {
            bins[i] = new List<int>();
        }

        int numCol = dt.Columns["numberColumn"].Ordinal;
        int valCol = dt.Columns["Value"].Ordinal;

        foreach (DataRow dr in dt.Rows)
        {
            // Convert the key once per row instead of twice per range comparison.
            int number = Convert.ToInt32(dr[numCol]);

            for (int i = 0; i < nRanges; i++)
            {
                Range candidate = ranges[i];
                if (number >= candidate.LowerRangeInclusive && number < candidate.UpperRangeExclusive)
                {
                    bins[i].Add(Convert.ToInt32(dr[valCol]));
                    break; // first matching range wins
                }
            }
        }

        //TODO: Do something meaningful in the case where bins[i].Count == 0; for now the range's Average is left unchanged.
        for (int i = 0; i < nRanges; i++)
        {
            if (bins[i].Count > 0)
            {
                ranges[i].Average = bins[i].Average();
            }
        }

    }

    /// <summary>
    /// Generates the sample data and ranges, times SetAverages, and prints the results.
    /// </summary>
    public static void Main()
    {
        Console.Write("Init...");
        dt = GetData();
        ranges = GetRanges();
        Console.WriteLine("done.");

        // Show ranges
        //foreach (Range r in ranges)
        //{
        //    Console.WriteLine(String.Format("{0,4} {1,5}", r.LowerRangeInclusive, r.UpperRangeExclusive));
        //}

        // Show datarows
        //for (int i = 0; i < dt.Rows.Count; i++)
        //{
        //    Console.WriteLine("{0,4} {1,5}", dt.Rows[i][0], dt.Rows[i][1]);
        //}

        //Time it so that comparisons can be made to other methods.
        Stopwatch sw = new Stopwatch();
        sw.Start();
        SetAverages(ranges, dt);
        sw.Stop();
        Console.WriteLine("{0} rows processed in {1} bins in {2}ms.", dt.Rows.Count, NBINS, sw.ElapsedMilliseconds);

        // Show the results
        foreach (Range r in ranges)
        {
            Console.WriteLine("[{0},{1}): {2}", r.LowerRangeInclusive, r.UpperRangeExclusive, r.Average);
        }

        Console.ReadLine();

    }

}

P.S. 我最初是用VB编写的,然后用在线转换器转换成了C#,因此可能还有一些地方可以针对C#进一步优化,只是我不清楚。