我正在从我的硬盘上的文本文件中阅读IMDB电影列表(最初可从ftp://ftp.fu-berlin.de/pub/misc/movies/database/movies.list.gz的IMDB网站获得)。
我的机器大约需要5分钟(基本信息:Win7 x64bit,16GB RAM,500 GB SATA Hardisk 7200 RPM),使用下面的代码逐行读取此文件。
我有两个问题:
有什么办法可以优化代码来改善阅读时间吗?
数据访问不需要是顺序的,因为我不介意从上到下/从下到上读取数据或任何顺序,只要它一次读取一行。我想知道有没有办法在多个方向阅读以改善阅读时间?
该应用程序是Windows控制台应用程序。
更新:许多回复都正确地指出写入控制台需要大量时间。考虑到现在可以在Windows控制台上显示数据,但不是强制性的
//代码块
string file = @"D:\movies.list";
FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
using (StreamReader sr = new StreamReader(fs))
{
while (sr.Peek() >= 0)
{
Console.WriteLine(sr.ReadLine());
}
}
答案 0 :(得分:0)
我不确定这是否更有效,但另一种方法是使用File.ReadAllLines:
var movieFile = File.ReadAllLines(file);
foreach (var movie in movieFile)
Console.WriteLine(movie);
答案 1 :(得分:0)
在.net 4中,您可以使用File.ReadLines进行延迟评估,从而在处理大型文件时降低RAM使用率。
您可以直接对文件执行linq操作,这与File.ReadLines一起可以改善加载时间。
为了更好地理解,您可以查看Read text file word-by-word using LINQ
您也可以进行比较,但也要设置时间间隔。
但是,如果您创建Web应用程序,则可以在应用程序启动事件中读取整个文件,并将它们缓存在应用程序池中以获得更好的性能。
答案 2 :(得分:0)
我不是c#开发人员,但是如何使用该文件批量插入数据库(这将是一次)。然后,您也可以重复使用数据并导出。
答案 3 :(得分:0)
这个问题的答案实际上取决于你将对数据做什么。如果你的意图真的只是读入文件并将内容转储到控制台屏幕,那么最好使用StringBuilder Class建立一个字符串,比如1000行,然后将内容转储到屏幕,重置字符串,然后读取另外1000行,转储它们等...
但是,如果您正在尝试构建属于较大项目且使用.NET 4.0的内容,则可以使用MemoryMappedFile Class来读取文件并创建CreateViewAccessor来创建“窗口”,仅对部分数据进行操作,而不是读取整个文件。
另一种选择是让线程一次性读取文件的不同部分,然后将它们全部放在一起。
如果您可以更具体地了解您打算如何处理这些数据,我可以为您提供更多帮助。希望这有帮助!
编辑:
尝试使用此代码。我能够使用Threads以3秒的时间读取整个列表:
using System;
using System.IO;
using System.Text;
using System.Threading;
namespace ConsoleApplication36
{
class Program
{
private const string FileName = @"C:\Users\Public\movies.list";
private const long ThreadReadBlockSize = 50000;
private const int NumberOfThreads = 4;
private static byte[] _inputString;
static void Main(string[] args)
{
var fi = new FileInfo(FileName);
long totalBytesRead = 0;
long fileLength = fi.Length;
long readPosition = 0L;
Console.WriteLine("Reading Lines From {0}", FileName);
var threads = new Thread[NumberOfThreads];
var instances = new ReadThread[NumberOfThreads];
_inputString = new byte[fileLength];
while (totalBytesRead < fileLength)
{
for (int i = 0; i < NumberOfThreads; i++)
{
var rt = new ReadThread { StartPosition = readPosition, BlockSize = ThreadReadBlockSize };
instances[i] = rt;
threads[i] = new Thread(rt.Read);
threads[i].Start();
readPosition += ThreadReadBlockSize;
}
for (int i = 0; i < NumberOfThreads; i++)
{
threads[i].Join();
}
for (int i = 0; i < NumberOfThreads; i++)
{
if (instances[i].BlockSize > 0)
{
Array.Copy(instances[i].Output, 0L, _inputString, instances[i].StartPosition,
instances[i].BlockSize);
totalBytesRead += instances[i].BlockSize;
}
}
}
string finalString = Encoding.ASCII.GetString(_inputString);
Console.WriteLine(finalString.Substring(104250000, 50000));
}
private class ReadThread
{
public long StartPosition { get; set; }
public long BlockSize { get; set; }
public byte[] Output { get; private set; }
public void Read()
{
Output = new byte[BlockSize];
var inStream = new FileStream(FileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
inStream.Seek(StartPosition, SeekOrigin.Begin);
BlockSize = inStream.Read(Output, 0, (int)BlockSize);
inStream.Close();
}
}
}
}
您需要更改FileName以匹配movies.list文件的位置。此外,您可以调整线程总数。我使用了4,但你可以随意减少或增加。您还可以更改块大小...这是每个线程读入的数据量。此外,我假设它是一个ASCII文本文件。如果不是,则需要将编码类型更改为UTF8或文件所在的任何编码。祝你好运!
答案 4 :(得分:0)
首先,如果您不关心将列表打印到控制台,请编辑您的问题。
其次,我创建了一个计时程序来测试建议的不同方法的速度:
class Program
{
private static readonly string file = @"movies.list";
private static readonly int testStart = 1;
private static readonly int numOfTests = 2;
private static readonly int MinTimingVal = 1000;
private static string[] testNames = new string[] {
"Naive",
"OneCallToWrite",
"SomeCallsToWrite",
"InParallel",
"InParallelBlcoks",
"IceManMinds",
"TestTiming"
};
private static double[] avgSecs = new double[numOfTests];
private static int[] testIterations = new int[numOfTests];
public static void Main(string[] args)
{
Console.WriteLine("Starting tests...");
Debug.WriteLine("Starting tests...");
Console.WriteLine("");
Debug.WriteLine("");
//*****************************
//The console is the bottle-neck, so we can
//speed-up redrawing it by only showing 1 line at a time.
Console.WindowHeight = 1;
Console.WindowWidth = 50;
Console.BufferHeight = 100;
Console.BufferWidth = 50;
//******************************
Action[] actionArray = new Action[numOfTests];
actionArray[0] = naive;
actionArray[1] = oneCallToWrite;
actionArray[2] = someCallsToWrite;
actionArray[3] = inParallel;
actionArray[4] = inParallelBlocks;
actionArray[5] = iceManMinds;
actionArray[6] = testTiming;
for (int i = testStart; i < actionArray.Length; i++)
{
Action a = actionArray[i];
DoTiming(a, i);
}
printResults();
Console.WriteLine("");
Debug.WriteLine("");
Console.WriteLine("Tests complete.");
Debug.WriteLine("Tests complete.");
Console.WriteLine("Press Enter to Close Console...");
Debug.WriteLine("Press Enter to Close Console...");
Console.ReadLine();
}
private static void DoTiming(Action a, int num)
{
a.Invoke();
Stopwatch watch = new Stopwatch();
Stopwatch loopWatch = new Stopwatch();
bool shouldRetry = false;
int numOfIterations = 2;
do
{
watch.Start();
for (int i = 0; i < numOfIterations; i++)
{
a.Invoke();
}
watch.Stop();
shouldRetry = false;
if (watch.ElapsedMilliseconds < MinTimingVal) //if the time was less than the minimum, increase load and re-time.
{
shouldRetry = true;
numOfIterations *= 2;
watch.Reset();
}
} while (shouldRetry);
long totalTime = watch.ElapsedMilliseconds;
double avgTime = ((double)totalTime) / (double)numOfIterations;
avgSecs[num] = avgTime / 1000.00;
testIterations[num] = numOfIterations;
}
private static void printResults()
{
Console.WriteLine("");
Debug.WriteLine("");
for (int i = testStart; i < numOfTests; i++)
{
TimeSpan t = TimeSpan.FromSeconds(avgSecs[i]);
Console.WriteLine("ElapsedTime: {0:N4}, " + "test: " + testNames[i], t.ToString() );
Debug.WriteLine("ElapsedTime: {0:N4}, " + "test: " + testNames[i], t.ToString() );
}
}
public static void naive()
{
FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
using (StreamReader sr = new StreamReader(fs))
{
while (sr.Peek() >= 0)
{
Console.WriteLine( sr.ReadLine() );
}
}
}
public static void oneCallToWrite()
{
FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
using (StreamReader sr = new StreamReader(fs))
{
StringBuilder sb = new StringBuilder();
while (sr.Peek() >= 0)
{
string s = sr.ReadLine();
sb.Append("\n" + s);
}
Console.Write(sb);
}
}
public static void someCallsToWrite()
{
FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
using (StreamReader sr = new StreamReader(fs))
{
StringBuilder sb = new StringBuilder();
int count = 0;
int mod = 10000;
while (sr.Peek() >= 0)
{
count++;
string s = sr.ReadLine();
sb.Append("\n" + s);
if (count % mod == 0)
{
Console.Write(sb);
sb = new StringBuilder();
}
}
Console.Write( sb );
}
}
public static void inParallel()
{
string[] wordsFromFile = File.ReadAllLines( file );
int length = wordsFromFile.Length;
Parallel.For( 0, length, i => {
Console.WriteLine( wordsFromFile[i] );
});
}
public static void inParallelBlocks()
{
string[] wordsFromFile = File.ReadAllLines(file);
int length = wordsFromFile.Length;
Parallel.For<StringBuilder>(0, length,
() => { return new StringBuilder(); },
(i, loopState, sb) =>
{
sb.Append("\n" + wordsFromFile[i]);
return sb;
},
(x) => { Console.Write(x); }
);
}
#region iceManMinds
public static void iceManMinds()
{
string FileName = file;
long ThreadReadBlockSize = 50000;
int NumberOfThreads = 4;
byte[] _inputString;
var fi = new FileInfo(FileName);
long totalBytesRead = 0;
long fileLength = fi.Length;
long readPosition = 0L;
Console.WriteLine("Reading Lines From {0}", FileName);
var threads = new Thread[NumberOfThreads];
var instances = new ReadThread[NumberOfThreads];
_inputString = new byte[fileLength];
while (totalBytesRead < fileLength)
{
for (int i = 0; i < NumberOfThreads; i++)
{
var rt = new ReadThread { StartPosition = readPosition, BlockSize = ThreadReadBlockSize };
instances[i] = rt;
threads[i] = new Thread(rt.Read);
threads[i].Start();
readPosition += ThreadReadBlockSize;
}
for (int i = 0; i < NumberOfThreads; i++)
{
threads[i].Join();
}
for (int i = 0; i < NumberOfThreads; i++)
{
if (instances[i].BlockSize > 0)
{
Array.Copy(instances[i].Output, 0L, _inputString, instances[i].StartPosition,
instances[i].BlockSize);
totalBytesRead += instances[i].BlockSize;
}
}
}
string finalString = Encoding.ASCII.GetString(_inputString);
Console.WriteLine(finalString);//.Substring(104250000, 50000));
}
private class ReadThread
{
public long StartPosition { get; set; }
public long BlockSize { get; set; }
public byte[] Output { get; private set; }
public void Read()
{
Output = new byte[BlockSize];
var inStream = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
inStream.Seek(StartPosition, SeekOrigin.Begin);
BlockSize = inStream.Read(Output, 0, (int)BlockSize);
inStream.Close();
}
}
#endregion
public static void testTiming()
{
Thread.Sleep(500);
}
}
每个测试都会将文件打印到控制台。
在默认控制台设置下运行时,每次测试都在5:30到6:10之间(最小值:秒)。
在考虑Console属性之后,通过使Console.WindowHeight = 1,即一次只显示一行,(你可以向上和向下滚动以查看最近的100行),我达到了速度-up。
目前,对于大多数方法,任务在2:40(Min:Sec)稍微完成。
在您的计算机上试一试,看看它是如何运作的。
有趣的是,不同的方法基本相同,OP的代码基本上是最快的。
计时代码预热代码然后运行两次并平均所需的时间,它为每种方法执行此操作。
随意尝试自己的方法并计算时间。