我们有一个递归函数,用于将记录批量插入Cosmos。 在运行长时间迁移以插入许多记录时,似乎有些记录无法插入,但是我们无法从日志记录中查明原因。 我的假设是某种状态码在应有的状态下或在出现静默故障时不会重试。
任何人都可以告知导致记录失败而没有被记录/重试的原因吗?
AllowBulkExecution = true
是在创建传入下面代码的容器时设置的。
public async Task<bool> TryBulkInsertAsync<T>(List<T> audits, int retryAttempts, TimeSpan retryDelay, int currentAttempt = 0, Container container = null) where T : BaseCosmosModel
{
if (currentAttempt > retryAttempts)
{
_logger.LogError($"Failed number of max retries ${retryAttempts}");
return false;
}
if (container == null)
{
container = _cosmosContainerFactory.BuildCosmosContainer<T>();
}
var attemptAudits = audits.Select(a => new CosmosInsertAttempt<T>
{
Audit = a,
RunningTask = container.CreateItemAsync(a)
}).ToList();
try
{
await Task.WhenAll(attemptAudits.Select(a => a.RunningTask));
var failedAudits = attemptAudits.Where(a =>
a.RunningTask.Result.StatusCode == HttpStatusCode.TooManyRequests ||
a.RunningTask.Result.StatusCode == HttpStatusCode.PreconditionFailed ||
a.RunningTask.Result.StatusCode == HttpStatusCode.RequestTimeout ||
a.RunningTask.Result.StatusCode == HttpStatusCode.ServiceUnavailable
).Select(a => a.Audit).ToList();
var nonRetryableAudits = attemptAudits.Where(a =>
!failedAudits.Contains(a.Audit) && (a.RunningTask.Result.StatusCode < (HttpStatusCode) 200 ||
a.RunningTask.Result.StatusCode > (HttpStatusCode) 299));
foreach (var audit in nonRetryableAudits)
{
_logger.LogError("Audit failed to bulk insert with non-retryable status code {cosmosAuditInsert}", audit.Audit);
}
if (failedAudits.Count > 0)
{
_logger.LogError("Retrying bulk insert from incorrect status code but no error, count: {retrySize}", failedAudits.Count);
return await TryBulkInsertAsync(failedAudits, retryAttempts, retryDelay, currentAttempt + 1, container);
}
return true;
}
catch (Exception)
{
await Task.Delay(retryDelay);
var failedAuditsWithException = attemptAudits
.Where(a => a.RunningTask.Exception != null)
.Select(a => a.Audit).ToList();
var failedAuditsWithBadStatusCode = attemptAudits.Where(a =>
(
a.RunningTask.Exception == null && a.RunningTask.Result != null &&
(a.RunningTask.Result.StatusCode == HttpStatusCode.TooManyRequests ||
a.RunningTask.Result.StatusCode == HttpStatusCode.PreconditionFailed ||
a.RunningTask.Result.StatusCode == HttpStatusCode.RequestTimeout ||
a.RunningTask.Result.StatusCode == HttpStatusCode.ServiceUnavailable)
)).Select(a => a.Audit).ToList();
if (failedAuditsWithBadStatusCode.Any())
{
_logger.LogError("Retrying bulk insert from incorrect status code but no error (some have exceptions), count: {retrySize}", failedAuditsWithBadStatusCode.Count);
}
var failedAudits = failedAuditsWithException.Concat(failedAuditsWithBadStatusCode).ToList();
var nonRetryableAudits = attemptAudits.Where(a =>
!failedAudits.Contains(a.Audit) && a.RunningTask.Exception == null && (a.RunningTask.Result.StatusCode < (HttpStatusCode)200 ||
a.RunningTask.Result.StatusCode > (HttpStatusCode)299));
foreach (var audit in nonRetryableAudits)
{
_logger.LogError("Audit failed to bulk insert with non-retryable status code {cosmosAuditInsert}", audit.Audit);
}
if (failedAudits.Count > 0)
{
_logger.LogError("Retrying bulk insert {failedAuditCount}", failedAudits.Count);
return await TryBulkInsertAsync(failedAudits, retryAttempts, retryDelay, currentAttempt + 1, container);
}
return true;
}
}