我有以下方法,用于读取txt文件并返回字典。读取一个约5MB的文件(67000行,每行70个字符)需要约7分钟。
/// <summary>
/// Reads a text file line by line and builds a dictionary that maps each name
/// taken from a line starting with ">" to the concatenation of the trimmed,
/// non-blank lines that follow it.
/// </summary>
/// <param name="file">Path of the file to read.</param>
/// <returns>
/// The dictionary that was built. If an <see cref="IOException"/> is thrown while
/// reading, it is written to the console and the entries collected so far are returned.
/// </returns>
public static Dictionary<string, string> FASTAFileReadIn(string file)
{
Dictionary<string, string> seq = new Dictionary<string, string>();
Regex re;
Match m;
GroupCollection group;
// Name of the entry that subsequent data lines are appended to; empty until a header is accepted.
string currentName = string.Empty;
try
{
using (StreamReader sr = new StreamReader(file))
{
string line = string.Empty;
while ((line = sr.ReadLine()) != null)
{
if (line.StartsWith(">"))
{// Header line: extract the name that follows ">"
// NOTE: a new Regex object is constructed for every header line.
re = new Regex(@"^>(\S+)");
m = re.Match(line);
if (m.Success)
{
group = m.Groups;
// A name that is already present is ignored, and currentName keeps its previous value.
if (!seq.ContainsKey(group[1].Value))
{
seq.Add(group[1].Value, string.Empty);
currentName = group[1].Value;
}
}
}
// Data line: only used when it contains non-whitespace and a header has been accepted.
else if (Regex.Match(line.Trim(), @"\S+").Success &&
currentName != string.Empty)
{
// String concatenation builds a new string for every appended line.
seq[currentName] += line.Trim();
}
}
}
}
catch (IOException e)
{
// The exception is reported on the console and not rethrown.
Console.WriteLine("An IO exception has benn thrown!");
Console.WriteLine(e.ToString());
}
// Empty finally block: the using statement above already disposes the reader.
finally { }
return seq;
}
代码的哪一部分最耗时,以及如何加快速度?
谢谢
答案 0 :(得分:3)
我希望编译器会自动执行此操作,但我注意到的第一件事是你在每个匹配行上重新编译正则表达式:
while ((line = sr.ReadLine()) != null)
{
if (line.StartsWith(">"))
{// Match Sequence
// A new Regex object is constructed here for every line that starts with ">".
re = new Regex(@"^>(\S+)");
如果你可以完全不用正则表达式,那就更好了;大多数语言都提供某种split
函数,其速度通常远胜正则表达式...
答案 1 :(得分:2)
缓存并编译正则表达式,重新排序条件,减少修剪次数等。
/// <summary>
/// Reads a FASTA-style text file into a dictionary that maps each record name
/// (the non-whitespace run directly after ">" on a header line) to the
/// concatenation of the trimmed lines that follow that header.
/// </summary>
/// <param name="file">Path of the file to read.</param>
/// <returns>
/// One entry per distinct record name. If an <see cref="IOException"/> occurs
/// while reading, it is written to the console and the records collected up to
/// that point are returned.
/// </returns>
/// <remarks>
/// A header whose name was already seen does not start a new record; the lines
/// after it keep being appended to the record that was current before it.
/// </remarks>
public static Dictionary<string, string> FASTAFileReadIn(string file) {
    // Sequences are accumulated in StringBuilders: appending to a string with +=
    // copies the whole sequence on every line, which is quadratic in its length.
    var builders = new Dictionary<string, System.Text.StringBuilder>();
    var headerPattern = new Regex(@"^>(\S+)", RegexOptions.Compiled);
    System.Text.StringBuilder current = null;
    try {
        foreach(string line in File.ReadLines(file)) {
            // Blank lines must be skipped before indexing: line[0] throws on "".
            if(line.Length == 0) {
                continue;
            }
            if(line[0] == '>') {
                Match m = headerPattern.Match(line);
                if(m.Success) {
                    string name = m.Groups[1].Value;
                    if(!builders.ContainsKey(name)) {
                        current = new System.Text.StringBuilder();
                        builders.Add(name, current);
                    }
                }
            } else if(current != null) {
                // Trim() of a whitespace-only line is empty, so appending it is a no-op.
                current.Append(line.Trim());
            }
        }
    } catch(IOException e) {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }
    // Materialize the builders into the string-valued dictionary callers expect.
    var seq = new Dictionary<string, string>(builders.Count);
    foreach(KeyValuePair<string, System.Text.StringBuilder> entry in builders) {
        seq.Add(entry.Key, entry.Value.ToString());
    }
    return seq;
}
然而,这只是一个粗浅的优化。在阅读了FASTA格式的说明之后,我写了下面的代码:
/// <summary>
/// Reads a FASTA-style text file into a dictionary. Each line starting with ">"
/// begins a new record whose key is the text after ">" up to the first space
/// found at index 2 or later (or the rest of the line if there is none); the
/// following lines, with trailing whitespace removed, are concatenated into
/// the record's value.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <returns>A dictionary mapping each record key to its concatenated sequence.</returns>
/// <exception cref="ArgumentException">
/// Thrown by <c>Dictionary.Add</c> when the same key appears on more than one header line.
/// </exception>
/// <remarks>I/O exceptions raised while reading are not caught here.</remarks>
public static Dictionary<string, string> ReadFasta(string filename) {
    var result = new Dictionary<string, string>();
    var current = new StringBuilder();
    string currentKey = null;
    foreach(string line in File.ReadLines(filename)) {
        // Blank lines must be skipped before indexing: line[0] throws on "".
        if(line.Length == 0) {
            continue;
        }
        if(line[0] == '>') {
            // A new header closes the record collected so far.
            if(currentKey != null) {
                result.Add(currentKey, current.ToString());
                current.Clear();
            }
            // IndexOf throws when the start index is past the end of the string,
            // which would happen for a bare ">" line, so guard on the length.
            int i = line.Length > 2 ? line.IndexOf(' ', 2) : -1;
            currentKey = i > -1 ? line.Substring(1, i - 1) : line.Substring(1);
        } else if(currentKey != null) {
            current.Append(line.TrimEnd());
        }
    }
    // Flush the last record, which has no following header to close it.
    if(currentKey != null) {
        result.Add(currentKey, current.ToString());
    }
    return result;
}
告诉我它是否有效;它应该快得多。
答案 2 :(得分:1)
// Open the file for reading while letting other handles read and write it,
// then wrap it in a buffered stream and read text through a StreamReader.
using (var sharedFile = File.Open(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
    using (var buffered = new BufferedStream(sharedFile))
    {
        using (var reader = new StreamReader(buffered))
        {
            // Use the StreamReader
        }
    }
}
如果您的处理时间约为5分钟,那么@sarnold提到的Regex
重新编译很可能是您最大的性能杀手。
答案 3 :(得分:1)
这是我的写法。由于缺少更多信息(例如字典条目的平均长度),我无法优化StringBuilder的初始容量。您也可以按照Eric J.的建议添加BufferedStream
。理想情况下,如果你想要提高性能,你会完全取消Regular Expressions
,但它们更容易编写和管理,所以我理解你为什么要使用它们。
/// <summary>
/// Reads a FASTA-style text file into a dictionary that maps each record name
/// (the non-whitespace run directly after ">" at the start of a line) to a
/// <see cref="StringBuilder"/> holding the concatenation of the trimmed,
/// non-blank lines that follow that header.
/// </summary>
/// <param name="file">Path of the file to read.</param>
/// <returns>
/// One entry per distinct record name. If an <see cref="IOException"/> occurs
/// while reading, it is written to the console and the records collected up to
/// that point are returned.
/// </returns>
/// <remarks>
/// A header whose name was already seen does not start a new record; the lines
/// after it keep being appended to the record that was current before it.
/// </remarks>
public static Dictionary<string, StringBuilder> FASTAFileReadIn(string file)
{
    var seq = new Dictionary<string, StringBuilder>();
    var regName = new Regex("^>(\\S+)", RegexOptions.Compiled);
    var regAppend = new Regex("\\S+", RegexOptions.Compiled);
    // Builder of the record currently being filled; null until a header is accepted.
    StringBuilder current = null;
    try
    {
        using (StreamReader sReader = new StreamReader(file))
        {
            string line;
            while ((line = sReader.ReadLine()) != null)
            {
                Match header = regName.Match(line);
                if (header.Success)
                {
                    string name = header.Groups[1].Value;
                    if (!seq.ContainsKey(name))
                    {
                        current = new StringBuilder();
                        seq.Add(name, current);
                    }
                }
                // Check for an active record first so the regex only runs when needed.
                else if (current != null && regAppend.IsMatch(line))
                {
                    // Append the whole trimmed line: the value of the first \S+ match
                    // would drop everything after the first inner whitespace.
                    current.Append(line.Trim());
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }
    return seq;
}
正如您所看到的,我稍微更改了您的字典,以使用优化的StringBuilder
类来附加值。我还预先编译了一次正则表达式,只是为了确保你不会一遍又一遍地重复编译相同的正则表达式。我还提取了你的“追加”案例,以便编译成正则表达式。
请告诉我这是否有助于您提升性能。