我运行构建系统。数据方面,简化的描述是我有配置,每个配置都有0..n Builds。 现在构建生成工件,其中一些存储在服务器上。我正在做的是编写一种规则,它将每个配置构建产生的所有字节数相加,并检查它们是否过多。
目前的例程代码如下:
/// <summary>
/// Checks every recently updated configuration and collects a
/// <see cref="Notification"/> for each one whose retained builds occupy more
/// than the allowed amount of disc space.
/// </summary>
/// <param name="allConfigurations">All known configurations; only those with both
/// cleanup types set and updated within the last 7 days are inspected.</param>
private void CalculateExtendedDiskUsage(IEnumerable<Configuration> allConfigurations)
{
    const int maxDiscAllocationPerConfiguration = 1000000000; // 1GB
    // Default retention value (days or number of builds, depending on the cleanup type).
    // Kept as an int: the original stored it as the string "30" and parsed it on every use.
    const int buildCleanupCount = 30;

    var sw = Stopwatch.StartNew();

    // Lets take only confs that have been updated within last 7 days.
    // The cut-off is computed once so every configuration is compared against the same instant.
    var updatedSince = DateTime.UtcNow.AddDays(-7);
    var items = allConfigurations
        .Where(x => x.artifact_cleanup_type != null &&
                    x.build_cleanup_type != null &&
                    x.updated_date > updatedSince)
        .ToList();

    using (var ctx = new LocalEntities())
    {
        Debug.WriteLine("Context: " + sw.Elapsed);
        var allBuilds = ctx.Builds;
        var ruleResult = new List<Notification>();
        foreach (var configuration in items)
        {
            // all builds for current configuration (ordering is applied only where Take needs it)
            IQueryable<Build> configurationBuilds =
                allBuilds.Where(x => x.configuration_id == configuration.configuration_id);
            Debug.WriteLine("Filter conf builds: " + sw.Elapsed);

            // Since I don't know which builds/artifacts have been cleaned up, calculate it manually
            if (configuration.build_cleanup_count != null)
            {
                if (string.Equals(configuration.build_cleanup_type, "ReserveBuildsByDays", StringComparison.Ordinal))
                {
                    var buildLastCleanupDate = DateTime.UtcNow.AddDays(-buildCleanupCount);
                    configurationBuilds = configurationBuilds.Where(x => x.build_date > buildLastCleanupDate);
                }
                else if (string.Equals(configuration.build_cleanup_type, "ReserveBuildsByCount", StringComparison.Ordinal))
                {
                    // Newest builds first, so Take keeps the most recent ones.
                    configurationBuilds = configurationBuilds
                        .OrderByDescending(z => z.build_date)
                        .Take(buildCleanupCount);
                }
            }
            if (configuration.artifact_cleanup_count != null)
            {
                // skipped, similar to previous block
            }
            Debug.WriteLine("Done cleanup: " + sw.Elapsed);

            // Sum all disc usage per configuration.
            // The id comes from the group key instead of FirstOrDefault(), and the Total/Average
            // aggregates the original selected but never read are no longer computed.
            var confDiscSizePerConfiguration = configurationBuilds
                .GroupBy(c => new {c.configuration_id})
                .Where(c => c.Sum(z => z.artifact_dir_size) > maxDiscAllocationPerConfiguration)
                .Select(groupedBuilds =>
                    new
                    {
                        configurationId = groupedBuilds.Key.configuration_id,
                        configurationPath = groupedBuilds.Select(b => b.configuration_path).FirstOrDefault()
                    })
                .ToList();
            Debug.WriteLine("Done db query: " + sw.Elapsed);

            ruleResult.AddRange(confDiscSizePerConfiguration.Select(iter => new Notification
            {
                ConfigurationId = iter.configurationId,
                CreatedDate = DateTime.UtcNow,
                RuleType = (int) RulesEnum.TooMuchDisc,
                ConfigrationPath = iter.configurationPath
            }));
            Debug.WriteLine("Finished loop: " + sw.Elapsed);
        }
        // find owners and insert...
    }
}
这正是我想要的结果,但我在想能否让它运行得更快。目前我看到的耗时如下:
Context: 00:00:00.0609067
// first round
Filter conf builds: 00:00:00.0636291
Done cleanup: 00:00:00.0644505
Done db query: 00:00:00.3050122
Finished loop: 00:00:00.3062711
// avg round
Filter conf builds: 00:00:00.0001707
Done cleanup: 00:00:00.0006343
Done db query: 00:00:00.0760567
Finished loop: 00:00:00.0773370
由 .ToList() 生成的 SQL 看起来非常混乱。(WHERE 中用到的所有列都已被数据库中的索引覆盖。)
我正在使用200种配置进行测试,因此这相当于00:00:18.6326722。我总共有大约8k项需要每天处理(所以整个例程需要10多分钟才能完成)。
我在网上搜索了很久,在我看来 Entity Framework 并不太适合并行处理。尽管知道这一点,我还是决定尝试 async/await 这种方法(这是我第一次使用它,如有错误请见谅)。
基本上,如果我将所有处理移出范围,例如:
foreach (var configuration in items)
{
var confDiscSizePerConfiguration = await GetData(configuration, allBuilds);
ruleResult.AddRange(confDiscSizePerConfiguration.Select(iter => new Notification
{
... skiped
}
和
// Asynchronously queries the builds of one configuration and returns a Tmp row
// (id, path, total and average artifact size) when their summed artifact size
// exceeds maxDiscAllocationPerConfiguration; otherwise the list is empty.
private async Task<List<Tmp>> GetData(Configuration configuration, IQueryable<Build> allBuilds)
{
    // Builds that belong to the given configuration, ordered by build_date descending.
    var configurationBuilds = allBuilds
        .Where(build => build.configuration_id == configuration.configuration_id)
        .OrderByDescending(build => build.build_date);
    //..skipped
    // Group per configuration and keep only groups above the size limit.
    var oversizedGroups = configurationBuilds
        .GroupBy(build => new {build.configuration_id})
        .Where(grp => (grp.Sum(build => build.artifact_dir_size) > maxDiscAllocationPerConfiguration));
    var summaries = oversizedGroups.Select(grp =>
        new Tmp
        {
            ConfigurationId = grp.FirstOrDefault().configuration_id,
            ConfigurationPath = grp.FirstOrDefault().configuration_path,
            Total = grp.Sum(build => build.artifact_dir_size),
            Average = grp.Average(build => build.artifact_dir_size)
        });
    // The query is sent to the database only here.
    return await summaries.ToListAsync();
}
出于某种原因,这使 200 个项目的执行时间从 18 秒减少到 13 秒。不过按照我的理解,由于我对每个 .ToListAsync() 都进行了 await,它仍然是按顺序处理的,对吗?
所以,当我把 foreach (var configuration in items) 替换为 Parallel.ForEach(items, async configuration => 时,“不能并行处理”的说法就开始显现了。做出这一更改后会导致如下错误:
在此上下文中,前一个异步操作完成之前又启动了第二个操作。请使用 'await' 确保在对此上下文调用其他方法之前,所有异步操作均已完成。不保证任何实例成员是线程安全的。
起初我有点困惑,因为我await
实际上在编译器允许的每个地方,但可能数据被快速播种。
我试图通过减少贪婪并将new ParallelOptions {MaxDegreeOfParallelism = 4}
添加到该并行循环来克服这一点,农民的假设是默认连接池大小为100,我想要使用的是4,应该是充足的。但它仍然失败。
我还尝试在GetData
方法中创建新的DbContexts,但它仍然失败。如果我没记错的话(现在不能测试),我得到了
底层连接无法打开
有什么可能使这个例程变得更快?
答案 0 :(得分:3)
在并行之前,优化查询本身是值得的。以下是一些可能会改善您的时间的建议:
1)使用 GroupBy 时请使用 Key。这可能会解决 SQL 查询复杂且嵌套的问题,因为这样做等于指示 Linq 直接使用 GROUP BY 中定义的键,而不是创建子查询。
// Groups the builds by (configuration_id, configuration_path), keeps only the groups whose
// summed artifact_dir_size exceeds maxDiscAllocationPerConfiguration, and materializes one
// summary row per remaining group.
var confDiscSizePerConfiguration = configurationBuilds
.GroupBy(c => new { ConfigurationId = c.configuration_id, ConfigurationPath = c.configuration_path})
.Where(c => (c.Sum(z => z.artifact_dir_size) > maxDiscAllocationPerConfiguration))
.Select(groupedBuilds =>
new
{
// Id and path are read from the group key rather than from the first element of the group.
configurationId = groupedBuilds.Key.ConfigurationId,
configurationPath = groupedBuilds.Key.ConfigurationPath,
Total = groupedBuilds.Sum(c => c.artifact_dir_size),
Average = groupedBuilds.Average(c => c.artifact_dir_size)
})
.ToList();
2)看来你被N + 1问题咬了。简单来说 - 您执行一个SQL查询以获取所有配置,并执行另外一个SQL查询以获取构建信息。总共会有大约8k个小查询,其中2个更大的查询就足够了。如果使用的内存不是约束,则使用ToLookup
获取内存中的所有构建数据并进行优化以快速查找。
// Enumerates ctx.Builds once and indexes the resulting rows in memory by configuration_id.
var allBuilds = ctx.Builds.ToLookup(x=>x.configuration_id);
稍后您可以通过以下方式查找构建:
// Fetches the builds of one configuration from the lookup, sorted by build_date descending.
var configurationBuilds = allBuilds[configuration.configuration_id].OrderByDescending(z => z.build_date);
3)您对 configurationBuilds 多次调用了 OrderBy。过滤不会影响记录的顺序,因此可以安全地删除多余的 OrderBy 调用:
...
// Keeps only builds whose build_date is after buildLastCleanupDate; no re-sorting afterwards.
configurationBuilds = configurationBuilds.Where(x => x.build_date > buildLastCleanupDate);
...
// Keeps only the first buildLastCleanupCount builds of the current sequence; no re-sorting afterwards.
configurationBuilds = configurationBuilds.Take(buildLastCleanupCount);
...
4)没有必要做GroupBy
因为已经针对单个配置过滤了构建。
**更新:**
我更进了一步,创建了一个代码,可以通过单个请求检索与提供的代码相同的结果。它应该更高性能并且使用更少的内存。
/// <summary>
/// Finds, with a single database query, every configuration whose retained builds
/// occupy more than the allowed disc space and builds a <see cref="Notification"/>
/// for each of them.
/// </summary>
private void CalculateExtendedDiskUsage()
{
    // These three values were referenced by the query but never declared in this method.
    const int maxDiscAllocationPerConfiguration = 1000000000; // 1GB
    const int buildCleanupCount = 30; // default retention (days or number of builds)
    var buildLastCleanupDate = DateTime.UtcNow.AddDays(-buildCleanupCount);

    using (var ctx = new LocalEntities())
    {
        var createdDate = DateTime.UtcNow;
        var ruleResult = ctx.Configurations
            .Where(x => x.build_cleanup_count != null &&
                        (
                            (x.build_cleanup_type == "ReserveBuildsByDays" &&
                             ctx.Builds
                                 .Where(y => y.configuration_id == x.configuration_id)
                                 .Where(y => y.build_date > buildLastCleanupDate)
                                 .Sum(y => y.artifact_dir_size) > maxDiscAllocationPerConfiguration) ||
                            (x.build_cleanup_type == "ReserveBuildsByCount" &&
                             ctx.Builds
                                 .Where(y => y.configuration_id == x.configuration_id)
                                 .OrderByDescending(y => y.build_date)
                                 .Take(buildCleanupCount)
                                 .Sum(y => y.artifact_dir_size) > maxDiscAllocationPerConfiguration)
                        )
            )
            // Project to an anonymous type inside the database query and construct the
            // Notification objects in memory afterwards.
            // NOTE(review): LINQ to Entities cannot construct mapped entity types in a projection;
            // Notification is presumably such an entity - confirm.
            .Select(x => new { x.configuration_id, x.configuration_path })
            .AsEnumerable()
            .Select(x => new Notification
            {
                ConfigurationId = x.configuration_id,
                ConfigrationPath = x.configuration_path, // the original was missing this comma
                CreatedDate = createdDate,
                RuleType = (int)RulesEnum.TooMuchDisc,
            })
            .ToList();
    }
}
答案 1 :(得分:0)
首先,如果您打算走并行这条路线,请在每次并行迭代中创建一个新的上下文。但您需要编写一个能够一次往返就获取所有所需数据的查询。为了加快速度,您还可以在读取数据时禁用上下文的更改跟踪或代理。
答案 2 :(得分:0)
有很多优化的地方......
有些地方你应该放.ToArray()以避免多次询问服务器......
我做了很多重构,但由于缺乏更多信息,我无法检查。
也许这可以引导您找到更好的解决方案......
/// <summary>
/// Entry point of the rule: opens the database context, computes the notifications
/// for the given configurations and logs elapsed times to the debug output.
/// </summary>
/// <param name="allConfigurations">Configurations to evaluate.</param>
private void CalculateExtendedDiskUsage(IEnumerable<Configuration> allConfigurations)
{
    // The signature and the Stopwatch construction were garbled in the original text
    // (lost generic argument, translated type name); both are restored here.
    var sw = new Stopwatch();
    sw.Start();
    using (var ctx = new LocalEntities())
    {
        Debug.WriteLine("Context: " + sw.Elapsed);
        // NOTE(review): GetRulesResult declares this parameter as ICollection<Configuration>,
        // while ctx.Builds is a set of builds - the helper signatures need to be confirmed.
        var allBuilds = ctx.Builds;
        var ruleResult = GetRulesResult(sw, allConfigurations, allBuilds); // Clean Code!!!
        // find owners and insert...
    }
}
// Filters the configurations with IsConfigElegible (in parallel), creates the
// notifications for each remaining one and returns them as a materialized array.
private static IEnumerable<Notification> GetRulesResult(Stopwatch sw, IEnumerable<Configuration> allConfigurations, ICollection<Configuration> allBuilds)
{
    // Lets take only confs that have been updated within last 7 days
    var eligibleConfigurations = allConfigurations
        .AsParallel() // Check if you really need this right here...
        .Where(IsConfigElegible);

    // ToArray forces the whole pipeline to run before the elapsed time is logged.
    var notifications = eligibleConfigurations
        .SelectMany(configuration => CreateNotifications(sw, allBuilds, configuration))
        .ToArray();

    Debug.WriteLine("Finished loop: " + sw.Elapsed);
    return notifications;
}
// Returns true when the configuration has both cleanup types set and was
// updated within the last 7 days (UTC).
private static bool IsConfigElegible(Configuration x)
{
    if (x.artifact_cleanup_type == null)
    {
        return false;
    }
    if (x.build_cleanup_type == null)
    {
        return false;
    }
    var oldestAcceptedUpdate = DateTime.UtcNow.AddDays(-7);
    return x.updated_date > oldestAcceptedUpdate;
}
// Builds the (lazily evaluated) sequence of notifications for one configuration:
// selects its builds, applies the build and artifact cleanup filters, and yields a
// notification when the summed artifact size exceeds the 1GB limit.
private static IEnumerable<Notification> CreateNotifications(Stopwatch sw, IEnumerable<Configuration> allBuilds, Configuration configuration)
{
    const int maxDiscAllocationPerConfiguration = 1000000000; // 1GB

    // all builds for current configuration; ordering is deferred until it is needed
    var builds = allBuilds.Where(x => x.configuration_id == configuration.configuration_id);
    Debug.WriteLine("Filter conf builds: " + sw.Elapsed);

    builds = BuildCleanup(configuration, builds);
    builds = ArtifactCleanup(configuration, builds);
    Debug.WriteLine("Done cleanup: " + sw.Elapsed);

    // Sum all disc usage per configuration and turn every oversized group into a notification.
    // Nothing is enumerated here; the caller triggers the evaluation.
    var notifications = builds
        .OrderByDescending(z => z.build_date)
        .GroupBy(c => c.configuration_id)
        .Where(c => (c.Sum(z => z.artifact_dir_size) > maxDiscAllocationPerConfiguration))
        .Select(CreateSumPerConfiguration)
        .Select(CreateNotification);
    Debug.WriteLine("Done db query: " + sw.Elapsed);

    return notifications;
}
// Applies the build retention rules of the configuration to the given builds.
// When build_cleanup_count is null the builds are returned untouched.
private static IEnumerable<Configuration> BuildCleanup(Configuration configuration, IEnumerable<Configuration> builds)
{
    // Since I don't know which builds/artifacts have been cleaned up, calculate it manually
    if (configuration.build_cleanup_count != null)
    {
        const int buildCleanupCount = 30; // used both as a day count and as a build count
        var keptByDays = GetDiscartBelow(configuration, buildCleanupCount, builds);
        return GetDiscartAbove(configuration, buildCleanupCount, keptByDays);
    }

    return builds;
}
// Placeholder for the artifact retention rules; currently returns the builds unchanged
// whether or not artifact_cleanup_count is set.
private static IEnumerable<Configuration> ArtifactCleanup(Configuration configuration, IEnumerable<Configuration> configurationBuilds)
{
    if (configuration.artifact_cleanup_count == null)
    {
        return configurationBuilds;
    }

    // skipped, similar to previous block
    return configurationBuilds;
}
// Summarizes one group of builds: id and path come from the first element of the
// group, Total and Average are computed over artifact_dir_size of all elements.
private static SumPerConfiguration CreateSumPerConfiguration(IGrouping<object, Configuration> groupedBuilds)
{
    var firstBuild = groupedBuilds.First();

    var summary = new SumPerConfiguration();
    summary.configurationId = firstBuild.configuration_id;
    summary.configurationPath = firstBuild.configuration_path;
    summary.Total = groupedBuilds.Sum(c => c.artifact_dir_size);
    summary.Average = groupedBuilds.Average(c => c.artifact_dir_size);
    return summary;
}
// For configurations of type "ReserveBuildsByDays": keeps only the builds whose
// build_date is newer than buildCleanupCount days ago (UTC). Other cleanup types
// get the sequence back unchanged.
private static IEnumerable<Configuration> GetDiscartBelow(Configuration configuration,
    int buildCleanupCount,
    IEnumerable<Configuration> configurationBuilds)
{
    if (configuration.build_cleanup_type.Equals("ReserveBuildsByDays"))
    {
        var oldestKeptDate = DateTime.UtcNow.AddDays(-buildCleanupCount);
        return configurationBuilds.Where(build => build.build_date > oldestKeptDate);
    }

    return configurationBuilds;
}
// For configurations of type "ReserveBuildsByCount": keeps only the
// buildLastCleanupCount most recent builds. Other cleanup types (including a
// missing type) get the sequence back unchanged.
private static IEnumerable<Configuration> GetDiscartAbove(Configuration configuration,
    int buildLastCleanupCount,
    IEnumerable<Configuration> configurationBuilds)
{
    // Static Equals tolerates a null build_cleanup_type instead of throwing.
    if (!string.Equals(configuration.build_cleanup_type, "ReserveBuildsByCount", StringComparison.Ordinal))
    {
        return configurationBuilds;
    }

    // Take only keeps the leading elements, so the builds must be sorted newest-first
    // here; callers no longer order the sequence before invoking this method.
    return configurationBuilds
        .OrderByDescending(build => build.build_date)
        .Take(buildLastCleanupCount);
}
// Converts a per-configuration summary into a TooMuchDisc notification stamped
// with the current UTC time.
private static Notification CreateNotification(SumPerConfiguration iter)
{
    var notification = new Notification();
    notification.ConfigurationId = iter.configurationId;
    notification.CreatedDate = DateTime.UtcNow;
    notification.RuleType = (int)RulesEnum.TooMuchDisc;
    notification.ConfigrationPath = iter.configurationPath;
    return notification;
}
}
// Aggregated artifact size information for a single configuration.
internal class SumPerConfiguration {
    // Identifier of the configuration; typed as 'object' because the real type is not known here.
    public object configurationId { get; set; }
    // Path of the configuration; typed as 'object' because the real type is not known here.
    public object configurationPath { get; set; }
    // Sum of the artifact sizes. Stored as long: the rule fires above 1,000,000,000,
    // and an int would overflow just past 2,147,483,647.
    public long Total { get; set; }
    // Average artifact size per build.
    public double Average { get; set; }
}