我有一个读取csv(200 mb)的应用程序。
var lines = File.ReadLines(file).ToList();
csv存储定价信息,其中包含约500,000条记录。 调用StoreValues时,下面的代码片段大约需要18秒。 有没有办法加快速度?
distinctMarketIds = 54个int值 行集合将有500k行,每行[0]具有即时匹配的marketId。
IEnumerable<string[]> newList = (lines.Where(t => distinctMarketIds.Contains(t.Split(',')[0]))
.Select(t => t.Split(',')));
log.Info(("Time Taken To Get Filtered Prices " + elapsed.Elapsed.TotalSeconds +" seconds."));
StoreValues(newList, file); //store the prices
log.Info(("Time Taken To Store Prices " + elapsed.Elapsed.TotalSeconds + " seconds."));
商店值方法使用Parallel.ForEach
Parallel.ForEach(finalLines, new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (line) =>
{
});
我似乎无法找到为什么需要18秒才能完成此循环。 我已经在具有类似规格的另一台机器上进行了测试,StoreValue方法需要2.5秒
#region LoadPriceDataFromCsvFile
public int LoadPriceDataFromCsvFile(string filename, string[] marketIdList, int maxThreads)
{
MaxThreads = maxThreads;
int filteredRows = 0;
string[] files = Directory.GetFiles(filename, "*.csv");
elapsed.Start();
log.InfoFormat("Total Csv files to Scan {0}",files.Length);
Parallel.ForEach(files, new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (file) =>
{
try
{
log.InfoFormat("About to Scan File {0}", file);
ScanCsvFilesAndGetPrices(file);
}
catch (System.OutOfMemoryException e)
{
log.Info(e);
}
catch (Exception e)
{
log.Info(e);
}
});
return PriceCollection.Count;
}
#endregion
#region ScanCsvFilesAndGetPrices
private void ScanCsvFilesAndGetPrices(string file)
{
try
{
log.Info(("Time Taken " + elapsed.Elapsed.TotalSeconds + " seconds."));
var lines = File.ReadLines(file).ToList();
log.Info(("Time Taken To Read csv " + elapsed.Elapsed.TotalSeconds + " seconds."));
if (lines.Any())
{
log.Info(("Time Taken To Read Any " + elapsed.Elapsed.TotalSeconds + " seconds."));
var firstLine = lines.ElementAt(1); // This is the First Line with Headers
log.Info(("Time Taken To Read First Line " + elapsed.Elapsed.TotalSeconds + " seconds."));
var lastLine = lines.Last(); // This is the Last line in the csv file
log.Info(("Time Taken To Read Last Line " + elapsed.Elapsed.TotalSeconds + " seconds."));
var header = lines.First().Split(',');
log.Info(("Time Taken To Split Header Line " + elapsed.Elapsed.TotalSeconds + " seconds."));
GetIndexOfFields(header);
log.Info(("Time Taken To Read Header " + elapsed.Elapsed.TotalSeconds + " seconds."));
// Get the Publish Date Time
if (PublishedDatetime_Index != -1)
{
var fLine = firstLine.Split(',');
var lLine = lastLine.Split(',');
var firstLineDatetime = (fLine[PublishedDatetime_Index].Contains("+"))? fLine[PublishedDatetime_Index].Remove(fLine[PublishedDatetime_Index].IndexOf("+",StringComparison.Ordinal)): fLine[PublishedDatetime_Index];
var publishDateTimeFirstLine =FileNameGenerator.GetCorrectTime(Convert.ToDateTime(firstLineDatetime));
string lastLineDatetime = (lLine[PublishedDatetime_Index].Contains("+"))? lLine[PublishedDatetime_Index].Remove(lLine[PublishedDatetime_Index].IndexOf("+",StringComparison.Ordinal)): lLine[PublishedDatetime_Index];
var publishDateTimeLastLine =FileNameGenerator.GetCorrectTime(Convert.ToDateTime(lastLineDatetime));
// check if the order execution date time of any order lieas between the date time of first and last line of csv so we can add that csv to our filtered list
string[] distinctMarketIds = OrderEntityColection.FindAll(obj =>obj.OrderLastChangeDateTimeUtc >= publishDateTimeFirstLine &&obj.OrderLastChangeDateTimeUtc <= publishDateTimeLastLine).Select(obj => obj.MarketId.ToString())
.Distinct()
.ToArray();
log.InfoFormat("Total Markets Identified {0}",distinctMarketIds.Length);
List<OrderEntity> foundOrdersList = OrderEntityColection.FindAll(obj =>obj.OrderLastChangeDateTimeUtc >= publishDateTimeFirstLine &&obj.OrderLastChangeDateTimeUtc <= publishDateTimeLastLine);
lock (FoundOrdersList)
{
FoundOrdersList.AddRange(foundOrdersList);
}
log.InfoFormat("Total Orders Identified {0}", FoundOrdersList.Count());
log.Info(("Time Taken To Read Execution Times and Market " + elapsed.Elapsed.TotalSeconds +" seconds."));
if (distinctMarketIds.Length != 0)
{
IEnumerable<string[]> newList =
(lines.Where(
t => distinctMarketIds.Contains(t.Split(',')[0]))
.Select(t => t.Split(','))
);
log.Info(("Time Taken To Get Filtered Prices " + elapsed.Elapsed.TotalSeconds +" seconds."));
// this is taking longer than expected. Somthing to do with IEnumerable<string[]>
StoreValues(newList, file); //store the prices
log.Info(("Time Taken To Store Prices " + elapsed.Elapsed.TotalSeconds + " seconds."));
}
}
}
}
catch (Exception e)
{
log.Info(e);
}
}
#endregion
#region GetIndexOfFields
// These are the fields we want to Look for from the headers and accordingly get their location
private void GetIndexOfFields(IEnumerable<string> lineHeader)
{
int index = 0;
foreach (var column in lineHeader)
{
if (column == "MarketId")
{
MarketId_Index= index;
}
if (column == "Bid")
{
Bid_Index = index; ;
}
if (column == "Ask")
{
Ask_Index = index;
}
if (column == "Mid")
{
Mid_Index = index;
}
if (column == "Is_Indicative")
{
Is_Indicative_Index = index;
}
if (column == "Price_Engine")
{
Price_Engine_Index = index;
}
if (column == "PublishedDatetime")
{
PublishedDatetime_Index = index;
}
if (column == "Market_Is_Open")
{
Market_Is_Open_Index = index;
}
if (column == "AuditId")
{
AuditId_Index = index;
}
if (column == "Row_Update_Version")
{
Row_Update_Version_Index = index;
}
if (column == "DontPublish")
{
DontPublish_Index = index; ;
}
index++;
}
}
#endregion
#region StoreValues
private void StoreValues(IEnumerable<string[]> finalLines, string file)
{
log.InfoFormat("total Finel Lines Sent for Storing {0}", finalLines.Count());
Parallel.ForEach(finalLines, new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (line) =>
{
var prices = new Prices();
// the code that you want to measure comes here
var datetime = (line[PublishedDatetime_Index].Contains("+")) ? line[PublishedDatetime_Index].Remove(line[PublishedDatetime_Index].IndexOf("+", StringComparison.Ordinal)) : line[PublishedDatetime_Index];
if (!IsNullOrEmpty(datetime))
{
prices.PublishedDatetime = Convert.ToDateTime(datetime);
}
if (!IsNullOrEmpty(line[MarketId_Index]))
{
prices.MarketId = Convert.ToInt32(line[MarketId_Index]);
}
if (!IsNullOrEmpty(line[Bid_Index]))
{
prices.Bid = Convert.ToDecimal(line[Bid_Index]);
}
if (!IsNullOrEmpty(line[Ask_Index]))
{
prices.Ask = Convert.ToDecimal(line[Ask_Index]);
}
if (!IsNullOrEmpty(line[Mid_Index]))
{
prices.Mid = Convert.ToDecimal(line[Mid_Index]);
}
if (!IsNullOrEmpty(line[Is_Indicative_Index]))
{
prices.Is_Indicative = Convert.ToBoolean(line[Is_Indicative_Index]);
}
else
{
prices.Is_Indicative = false;
}
if (!IsNullOrEmpty(line[Price_Engine_Index]))
{
prices.Price_Engine = Convert.ToString(line[Price_Engine_Index]);
}
if (!IsNullOrEmpty(line[Market_Is_Open_Index]))
{
prices.Market_Is_Open = line[Market_Is_Open_Index] == "1";
}
if (!IsNullOrEmpty(line[AuditId_Index]))
{
prices.AuditId = Convert.ToString(line[AuditId_Index]);
}
if (!IsNullOrEmpty(line[Row_Update_Version_Index]))
{
prices.Row_Update_Version = Convert.ToString(line[Row_Update_Version_Index]);
}
if (!IsNullOrEmpty(line[DontPublish_Index]))
{
if (DontPublish_Index != 0)
{
prices.DontPublish = line[DontPublish_Index] == "1";
}
}
prices.SbProdFile = file;
lock (PriceCollection)
{
PriceCollection.Add(prices);
}
});
}
答案 0 :(得分:1)
Parallel.ForEach
如何帮助提高效果File.ReadLines(file).ToList()
,如果您想要内存中的所有行,请使用ReadAllLines
;如果您想要一个接一个地处理这些行,请使用ReadLines
HashSet<string>
使用distinctMarketIds
:这应该更有效:
var marketIdSet = new HashSet<string>(OrderEntityColection.FindAll(obj =>obj.OrderLastChangeDateTimeUtc >= publishDateTimeFirstLine &&obj.OrderLastChangeDateTimeUtc <= publishDateTimeLastLine).Select(obj => obj.MarketId.ToString()));
IEnumerable<string[]> allFields = File.ReadLines(file)
.Select(line => line.Split(','))
.Where(arr => marketIdSet.Contains(arr[0]));
请注意,由于Select
和Where
的延迟执行,这只是一个查询,但尚未执行。因此,无论何时使用allFields
,您都将再次执行此查询。所以创建一个集合是一个好主意,例如。将allFields.ToList()
传递给StoreValues
:
StoreValues(allFields.ToList(), file); //store the prices
如果你传递了一个集合,那么在Parallel.ForEach
中使用StoreValues
真的可以从中受益。
答案 1 :(得分:0)
static void Main()
{
//var lines = File.ReadLines(file).ToList();
// this is just a fast generation for sample data
var lines = Enumerable.Range(0, 500000)
.Select(i => string.Join(",", i % 7, i, i & 2))
.ToList();
// HashSet will work as an indexed store and will match faster in your loop
var distinctMarketIds = new HashSet<string>{
"0", "2", "3", "5"
};
// Do this if you are to use the method syntax instead of the query syntax
// var newList = lines.Select(l=>l.Split(','))
// .Where(ps=>distinctMarketIds.Contains(ps[0]));
var newList = from l in lines
// this will parse the string once versus twice as you were doing before
let ps = l.Split(',')
where distinctMarketIds.Contains(ps[0])
select ps;
// can't see the content of your `StoreValues` method but writing to a
// file in parallel will never work as expected.
using (var stream = new StreamWriter("outfile.txt"))
foreach (var l in newList)
stream.WriteLine(string.Join(";", l));
}