我有一个500.000.000行的文件。
这些行是最多10个字符的字符串。
如何使用多线程和批量100来处理此文件?
答案 0 :(得分:2)
使用MoreLinq的Batch方法,这将创建一个IEnumerable<string>的集合,其中每个批次包含100行,并为每100行启动一个新任务。
这是一个基本的实现;使用Semaphore来限制在任何给定时间只运行一定数量的任务可能是明智的,同时也应评估File.ReadAllLines在处理500,000,000行时的性能开销。
答案 1 :(得分:1)
如果您使用内置TPL中的Parallel.ForEach
并编写几个枚举器(如下所列),则不需要使用其他库。您的代码可能如下所示:
// Read the file through a StreamReader and hand the lines to DoMyProcessing
// in batches of 100, processing several batches concurrently.
using (var input = new StreamReader(File.OpenRead(@"c:\path\to\my\file.txt")))
{
    var options = new ParallelOptions
    {
        // Cap concurrency at the number of logical processors on this machine
        // rather than a hard-coded 8.
        MaxDegreeOfParallelism = Environment.ProcessorCount
    };

    Parallel.ForEach(
        input.ReadLines().TakeChunks(100),
        options,
        batchOfLines =>
        {
            DoMyProcessing(batchOfLines);
        });
}
要使其工作,您需要IEnumerable<T>
上的几个扩展方法和几个枚举器,定义如下:
/// <summary>
/// Extension methods for reading a <see cref="StreamReader"/> line by line and for
/// grouping any sequence into fixed-size chunks, together with the enumerable and
/// enumerator types that implement them.
/// </summary>
public static class EnumerableExtensions
{
    /// <summary>
    /// Exposes the lines of <paramref name="input"/> as a sequence. Lines are read
    /// one at a time, each time the resulting enumerator advances.
    /// </summary>
    /// <param name="input">Reader to pull lines from.</param>
    /// <returns>A sequence that ends when the reader's <c>ReadLine</c> returns null.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static IEnumerable<string> ReadLines(this StreamReader input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        return new LineReadingEnumerable(input);
    }

    /// <summary>
    /// Groups <paramref name="source"/> into consecutive chunks of at most
    /// <paramref name="length"/> items; only the last chunk may be shorter.
    /// </summary>
    /// <param name="source">Sequence to split.</param>
    /// <param name="length">Maximum number of items per chunk; must be at least 1.</param>
    /// <returns>A sequence of non-empty chunks, in source order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is less than 1.</exception>
    public static IEnumerable<IReadOnlyList<T>> TakeChunks<T>(this IEnumerable<T> source, int length)
    {
        if (source == null)
        {
            throw new ArgumentNullException("source");
        }

        // A non-positive length used to produce an empty result and silently
        // drop every item of the source; reject it up front instead.
        if (length < 1)
        {
            throw new ArgumentOutOfRangeException("length", length, "Chunk length must be at least 1.");
        }

        return new ChunkingEnumerable<T>(source, length);
    }

    /// <summary>Enumerable wrapper around a <see cref="StreamReader"/> that yields its lines.</summary>
    public class LineReadingEnumerable : IEnumerable<string>
    {
        private readonly StreamReader _input;

        /// <summary>Wraps <paramref name="input"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public LineReadingEnumerable(StreamReader input)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            _input = input;
        }

        /// <summary>
        /// Creates an enumerator over the wrapped reader. Every enumerator shares the
        /// same reader instance, so they all advance the same underlying position.
        /// </summary>
        public IEnumerator<string> GetEnumerator()
        {
            return new LineReadingEnumerator(_input);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }

    /// <summary>Forward-only enumerator that returns one line of the reader per step.</summary>
    public class LineReadingEnumerator : IEnumerator<string>
    {
        private readonly StreamReader _input;
        private string _current;

        /// <summary>Creates an enumerator reading from <paramref name="input"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public LineReadingEnumerator(StreamReader input)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            _input = input;
        }

        /// <summary>
        /// Disposes the underlying reader. Note that this closes the reader the caller
        /// passed in, so it should not be reused after enumeration has finished.
        /// </summary>
        public void Dispose()
        {
            _input.Dispose();
        }

        /// <summary>Reads the next line; returns false once the reader yields null.</summary>
        public bool MoveNext()
        {
            _current = _input.ReadLine();
            return _current != null;
        }

        /// <summary>Not supported: this enumerator never rewinds the reader.</summary>
        /// <exception cref="NotSupportedException">Always thrown.</exception>
        public void Reset()
        {
            throw new NotSupportedException();
        }

        /// <summary>The line read by the most recent <see cref="MoveNext"/> (null before the first call and after the end).</summary>
        public string Current
        {
            get { return _current; }
        }

        object IEnumerator.Current
        {
            get { return _current; }
        }
    }

    /// <summary>Enumerable that presents an inner sequence as chunks of a fixed maximum size.</summary>
    public class ChunkingEnumerable<T> : IEnumerable<IReadOnlyList<T>>
    {
        private readonly IEnumerable<T> _inner;
        private readonly int _length;

        /// <summary>Wraps <paramref name="inner"/>, chunking it by <paramref name="length"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="inner"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is less than 1.</exception>
        public ChunkingEnumerable(IEnumerable<T> inner, int length)
        {
            if (inner == null)
            {
                throw new ArgumentNullException("inner");
            }

            if (length < 1)
            {
                throw new ArgumentOutOfRangeException("length", length, "Chunk length must be at least 1.");
            }

            _inner = inner;
            _length = length;
        }

        /// <summary>Starts a new enumeration of the inner sequence and chunks it.</summary>
        public IEnumerator<IReadOnlyList<T>> GetEnumerator()
        {
            return new ChunkingEnumerator<T>(_inner.GetEnumerator(), _length);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return this.GetEnumerator();
        }
    }

    /// <summary>Enumerator that drains an inner enumerator into lists of at most a fixed size.</summary>
    public class ChunkingEnumerator<T> : IEnumerator<IReadOnlyList<T>>
    {
        private readonly IEnumerator<T> _inner;
        private readonly int _length;
        private IReadOnlyList<T> _current;

        // Set once the inner enumerator reports the end, so it is never advanced again.
        private bool _endOfInner;

        /// <summary>Creates a chunking enumerator over <paramref name="inner"/>.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="inner"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is less than 1.</exception>
        public ChunkingEnumerator(IEnumerator<T> inner, int length)
        {
            if (inner == null)
            {
                throw new ArgumentNullException("inner");
            }

            if (length < 1)
            {
                throw new ArgumentOutOfRangeException("length", length, "Chunk length must be at least 1.");
            }

            _inner = inner;
            _length = length;
        }

        /// <summary>Disposes the inner enumerator and drops the current chunk.</summary>
        public void Dispose()
        {
            _inner.Dispose();
            _current = null;
        }

        /// <summary>
        /// Collects up to the configured number of items from the inner enumerator
        /// into a fresh list. Returns false when no item could be collected.
        /// </summary>
        public bool MoveNext()
        {
            var buffer = new List<T>();

            while (!_endOfInner && buffer.Count < _length)
            {
                if (_inner.MoveNext())
                {
                    buffer.Add(_inner.Current);
                }
                else
                {
                    _endOfInner = true;
                }
            }

            if (buffer.Count == 0)
            {
                _current = null;
                return false;
            }

            _current = buffer;
            return true;
        }

        /// <summary>Resets the inner enumerator (which may itself not support it) and clears state.</summary>
        public void Reset()
        {
            _inner.Reset();
            _current = null;
            _endOfInner = false;
        }

        /// <summary>The chunk produced by the most recent successful <see cref="MoveNext"/>.</summary>
        /// <exception cref="InvalidOperationException">There is no current chunk.</exception>
        public IReadOnlyList<T> Current
        {
            get
            {
                if (_current == null)
                {
                    throw new InvalidOperationException();
                }

                return _current;
            }
        }

        object IEnumerator.Current
        {
            get { return this.Current; }
        }
    }
}