我有一个LINQ数据表,如下所示:
numberColumn Value
1 3
4 1
30 6
20 10
50 5
我还有一个范围列表,看起来像这样(可以有任意数量的范围)
Ranges
range
lowerRangeInclusive = 0
upperRangeExclusive = 10
average = null
range
lowerRangeInclusive = 10
upperRangeExclusive = 40
average = null
range
lowerRangeInclusive = 40
upperRangeExclusive = 100
average = null
我需要快速计算出numberColumn落在每个范围内的那些行的Value平均值(我的实际数据非常大:数十万行、100多个范围)。在上面的示例中,结果应如下所示:
Ranges
range
lowerRangeInclusive = 0
upperRangeExclusive = 10
average = 2
range
lowerRangeInclusive = 10
upperRangeExclusive = 40
average = 8
range
lowerRangeInclusive = 40
upperRangeExclusive = 100
average = 5
目前我的代码大致如下,但运行很慢。是这段代码完全没有优化,还是无论怎么写都会这么慢?
// Fetch the table once, then compute each range's average by filtering the
// table again for that range (one filtered pass per range).
var table = GetTable();
foreach (var range in Ranges)
{
    // Values of the rows whose numberColumn lies in [lowerRangeInclusive, upperRangeExclusive).
    var valuesInRange = from row in table
                        where row.numberColumn >= range.lowerRangeInclusive &&
                              row.numberColumn < range.upperRangeExclusive
                        select row.Value;

    range.Average = valuesInRange.Average();
}
有更快的方法吗?
答案 0 :(得分:0)
你没有说“非常慢”对你意味着什么。我将数据分成列表,每个列表对应一个范围。对于200个范围内的100,000个项目,在我的计算机上处理不到一秒钟。
LINQ可以优雅而快速地编写,但它并不总是处理数据的最快方式。
如果这段代码不足以帮助您理解“binning”(分箱)的含义,可以查阅有关Histogram(直方图)的资料。
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Linq;
/// <summary>
/// Demo program: fills a <see cref="DataTable"/> with random rows, splits the
/// "numberColumn" axis into contiguous ranges (bins) and computes, for each
/// range, the average of "Value" over the rows whose "numberColumn" falls
/// inside that range.
/// </summary>
static class Module1
{
    const int NUMBERCOLUMNMAX = 1000;
    const int VALUEMAX = 100;
    const int SAMPLEDATASIZE = 100000; // how many rows to populate the DataTable with.
    const int NBINS = 200; // equivalent to quantity of ranges

    static readonly Random rand = new Random();

    /// <summary>
    /// A half-open interval [LowerRangeInclusive, UpperRangeExclusive) together
    /// with the average computed for it.
    /// </summary>
    public class Range
    {
        public int LowerRangeInclusive { get; set; }
        public int UpperRangeExclusive { get; set; }
        public double Average { get; set; }
    }

    /// <summary>
    /// Builds a table with two Int32 columns ("numberColumn", "Value") and
    /// fills it with SAMPLEDATASIZE random rows.
    /// </summary>
    /// <returns>The populated table.</returns>
    public static DataTable GetData()
    {
        DataTable table = new DataTable();
        table.Columns.Add("numberColumn", typeof(int));
        table.Columns.Add("Value", typeof(int));

        for (int i = 1; i <= SAMPLEDATASIZE; i++)
        {
            DataRow row = table.NewRow();
            // Random.Next's upper bound is exclusive, so numberColumn is in
            // 1..NUMBERCOLUMNMAX. The ranges from GetRanges() cover
            // [0, NUMBERCOLUMNMAX), so rows equal to NUMBERCOLUMNMAX match no
            // range and are ignored by SetAverages.
            row[0] = rand.Next(1, NUMBERCOLUMNMAX + 1);
            row[1] = rand.Next(1, VALUEMAX + 1);
            table.Rows.Add(row);
        }

        return table;
    }

    /// <summary>
    /// Splits [0, NUMBERCOLUMNMAX) into NBINS contiguous ranges.
    /// </summary>
    /// <returns>A new list of ranges ordered by lower bound.</returns>
    public static List<Range> GetRanges()
    {
        List<Range> result = new List<Range>(NBINS);
        for (int i = 0; i < NBINS; i++)
        {
            // Integer division of non-negative values floors, which matches
            // the floor of the exact quotient.
            result.Add(new Range
            {
                LowerRangeInclusive = (int)((long)NUMBERCOLUMNMAX * i / NBINS),
                UpperRangeExclusive = (int)((long)NUMBERCOLUMNMAX * (i + 1) / NBINS)
            });
        }

        return result;
    }

    /// <summary>
    /// Sets <see cref="Range.Average"/> on every range to the average "Value"
    /// of the rows whose "numberColumn" lies in that range.
    /// </summary>
    /// <param name="ranges">Ranges to fill in. A row is assigned to the first
    /// range in list order that contains it.</param>
    /// <param name="dt">Table with integer-convertible "numberColumn" and
    /// "Value" columns.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="ArgumentException">A required column is missing.</exception>
    /// <remarks>
    /// Ranges that receive no rows keep whatever Average they already had
    /// (0 for a freshly created Range).
    /// </remarks>
    public static void SetAverages(List<Range> ranges, DataTable dt)
    {
        if (ranges == null)
        {
            throw new ArgumentNullException("ranges");
        }
        if (dt == null)
        {
            throw new ArgumentNullException("dt");
        }

        DataColumn numberColumn = dt.Columns["numberColumn"];
        DataColumn valueColumn = dt.Columns["Value"];
        if (numberColumn == null || valueColumn == null)
        {
            throw new ArgumentException("The table must contain the columns \"numberColumn\" and \"Value\".", "dt");
        }

        int nRanges = ranges.Count;
        if (nRanges == 0 || dt.Rows.Count == 0)
        {
            return; // nothing to bin, so no average changes
        }

        // One list of values per range.
        List<int>[] bins = new List<int>[nRanges];
        for (int i = 0; i < nRanges; i++)
        {
            bins[i] = new List<int>();
        }

        int numCol = numberColumn.Ordinal;
        int valCol = valueColumn.Ordinal;

        // When the ranges are disjoint, each row has at most one matching
        // range, so a binary search gives the same answer as scanning the
        // list. Overlapping ranges fall back to the first-match linear scan.
        int[] order = TryGetDisjointOrder(ranges);

        foreach (DataRow dr in dt.Rows)
        {
            int number = Convert.ToInt32(dr[numCol]); // convert once per row
            int bin = order != null
                ? FindRangeSorted(ranges, order, number)
                : FindRangeLinear(ranges, number);

            if (bin >= 0)
            {
                bins[bin].Add(Convert.ToInt32(dr[valCol]));
            }
        }

        //TODO: Do something meaningful when bins[i] is empty instead of leaving the average untouched.
        for (int i = 0; i < nRanges; i++)
        {
            if (bins[i].Count > 0)
            {
                ranges[i].Average = bins[i].Average();
            }
        }
    }

    // Returns the range indices sorted by lower bound if no range overlaps the next one in that order; otherwise null.
    private static int[] TryGetDisjointOrder(List<Range> ranges)
    {
        int[] order = new int[ranges.Count];
        for (int i = 0; i < order.Length; i++)
        {
            order[i] = i;
        }

        Array.Sort(order, (a, b) =>
        {
            int byLower = ranges[a].LowerRangeInclusive.CompareTo(ranges[b].LowerRangeInclusive);
            return byLower != 0
                ? byLower
                : ranges[a].UpperRangeExclusive.CompareTo(ranges[b].UpperRangeExclusive);
        });

        for (int i = 1; i < order.Length; i++)
        {
            if (ranges[order[i - 1]].UpperRangeExclusive > ranges[order[i]].LowerRangeInclusive)
            {
                return null;
            }
        }

        return order;
    }

    // Binary search over disjoint ranges: returns the index (into ranges) of the range containing number, or -1.
    private static int FindRangeSorted(List<Range> ranges, int[] order, int number)
    {
        int lo = 0;
        int hi = order.Length - 1;
        int candidate = -1;

        // Find the last range (in sorted order) whose lower bound is <= number.
        while (lo <= hi)
        {
            int mid = lo + (hi - lo) / 2;
            if (ranges[order[mid]].LowerRangeInclusive <= number)
            {
                candidate = mid;
                lo = mid + 1;
            }
            else
            {
                hi = mid - 1;
            }
        }

        if (candidate < 0)
        {
            return -1;
        }

        int index = order[candidate];
        return number < ranges[index].UpperRangeExclusive ? index : -1;
    }

    // Linear scan: returns the index of the first range in list order containing number, or -1.
    private static int FindRangeLinear(List<Range> ranges, int number)
    {
        for (int i = 0; i < ranges.Count; i++)
        {
            if (number >= ranges[i].LowerRangeInclusive && number < ranges[i].UpperRangeExclusive)
            {
                return i;
            }
        }

        return -1;
    }

    /// <summary>
    /// Generates the sample data, times SetAverages and prints each range
    /// with its average. Waits for Enter before exiting.
    /// </summary>
    public static void Main()
    {
        Console.Write("Init...");
        DataTable dt = GetData();
        List<Range> ranges = GetRanges();
        Console.WriteLine("done.");

        //Time it so that comparisons can be made to other methods.
        Stopwatch sw = Stopwatch.StartNew();
        SetAverages(ranges, dt);
        sw.Stop();
        Console.WriteLine(string.Format("{0} rows processed in {1} bins in {2}ms.", dt.Rows.Count, NBINS, sw.ElapsedMilliseconds));

        // Show the results
        foreach (Range r in ranges)
        {
            Console.WriteLine(string.Format("[{0},{1}): {2}", r.LowerRangeInclusive, r.UpperRangeExclusive, r.Average));
        }

        Console.ReadLine();
    }
}
P.S. 我最初是用VB编写这段代码的,然后用在线转换器转成了C#,因此其中可能还有一些可以针对C#进行优化、而我没有注意到的地方。