我有一个包含大量字符串的列表(> 5000),我必须将第一个元素与所有后续元素进行比较。例如。考虑这个字符串列表:
{ one, two, three, four, five, six, seven, eight, nine, ten }
。现在我接受one
并将其与two, three, four, ...
进行比较,之后我将two
与three, four, ...
进行比较
我相信查找是为什么这需要这么长时间的问题。在正常的硬盘(7200rpm)上,它需要大约30秒,在ssd 10秒。我只是不知道如何才能加快速度。列表中的所有字符串都按升序排序,重要的是按此顺序检查它们。如果它可以大大加快速度,我不介意有一个无序列表。
我看了一下hashset,但是我需要检查顺序,这样即使使用快速包含方法也行不通。
编辑:因为看起来我不够清楚,并且Dusan想要的是完整的代码。我的问题案例:我有很多目录,名称相似,我只得到一个包含所有目录名称的集合,并相互比较它们的相似性和写作。因此,hdd和ssd之间的比较。但这很奇怪,因为我不是立即写作,而是把它放在一个字段中,然后写到最后。速度仍有差异。 为什么我不包括整个代码?因为我认为我的核心问题是从列表中查找值以及2个字符串之间的比较。其他一切应该已经足够快,添加到列表,查看黑名单(hashset)并获取目录名称列表。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Text.RegularExpressions;
using System.Diagnostics;
using System.Threading;
namespace Similarity
{
/// <summary>
/// Credit http://www.dotnetperls.com/levenshtein
/// Contains approximate string matching
/// </summary>
internal static class LevenshteinDistance
{
/// <summary>
/// Compute the distance between two strings.
/// </summary>
public static int Compute(string s, string t)
{
int n = s.Length;
int m = t.Length;
int[,] d = new int[n + 1, m + 1];
// Step 1
if (n == 0)
{
return m;
}
if (m == 0)
{
return n;
}
// Step 2
for (int i = 0; i <= n; d[i, 0] = i++)
{
}
for (int j = 0; j <= m; d[0, j] = j++)
{
}
// Step 3
for (int i = 1; i <= n; i++)
{
//Step 4
for (int j = 1; j <= m; j++)
{
// Step 5
int cost = (t[j - 1] == s[i - 1]) ? 0 : 1;
// Step 6
d[i, j] = Math.Min(
Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
d[i - 1, j - 1] + cost);
}
}
// Step 7
return d[n, m];
}
}
internal class Program
{
#region Properties
private static HashSet<string> _blackList = new HashSet<string>();
public static HashSet<string> blackList
{
get
{
return _blackList;
}
}
public static void AddBlackListEntry(string line)
{
blackList.Add(line);
}
private static List<string> _similar = new List<string>();
public static List<string> similar
{
get
{
return _similar;
}
}
public static void AddSimilarEntry(string s)
{
similar.Add(s);
}
#endregion Properties
private static void Main(string[] args)
{
Clean();
var directories = Directory.EnumerateDirectories(Directory.GetCurrentDirectory(), "*", SearchOption.TopDirectoryOnly)
.Select(x => new DirectoryInfo(x).Name).OrderBy(y => new DirectoryInfo(y).Name).ToList();
using (StreamWriter sw = new StreamWriter(@"result.txt"))
{
foreach (var item in directories)
{
Console.WriteLine(item);
sw.WriteLine(item);
}
Console.WriteLine("Amount of directories: " + directories.Count());
}
if (directories.Count != 0)
{
StartSimilarityCheck(directories);
}
else
{
Console.WriteLine("No directories");
}
WriteResult(similar);
Console.WriteLine("Finish. Press any key to exit...");
Console.ReadKey();
}
private static void StartSimilarityCheck(List<string> whiteList)
{
int counter = 0; // how many did we check yet?
var watch = Stopwatch.StartNew();
foreach (var dirName in whiteList)
{
bool insertDirName = true;
if (!IsBlackList(dirName))
{
// start the next element
for (int i = counter + 1; i <= whiteList.Count; i++)
{
// end of index reached
if (i == whiteList.Count)
{
break;
}
int similiariy = LevenshteinDistance.Compute(dirName, whiteList[i]);
// low score means high similarity
if (similiariy < 2)
{
if (insertDirName)
{
//Writer(dirName);
AddSimilarEntry(dirName);
insertDirName = false;
}
//Writer(whiteList[i]);
AddSimilarEntry(dirName);
AddBlackListEntry(whiteList[i]);
}
}
}
Console.WriteLine(counter);
//Console.WriteLine("Skip: {0}", dirName);
counter++;
}
watch.Stop();
Console.WriteLine("Time: " + watch.ElapsedMilliseconds / 1000);
}
private static void WriteResult(List<string> list)
{
using (StreamWriter sw = new StreamWriter(@"similar.txt", true, Encoding.UTF8, 65536))
{
foreach (var item in list)
{
sw.WriteLine(item);
}
}
}
private static void Clean()
{
// yeah hardcoded file names incoming. Better than global variables??
try
{
if (File.Exists(@"similar.txt"))
{
File.Delete(@"similar.txt");
}
if (File.Exists(@"result.txt"))
{
File.Delete(@"result.txt");
}
}
catch (Exception)
{
throw;
}
}
private static void Writer(string s)
{
using (StreamWriter sw = new StreamWriter(@"similar.txt", true, Encoding.UTF8, 65536))
{
sw.WriteLine(s);
}
}
private static bool IsBlackList(string name)
{
return blackList.Contains(name);
}
}
要修复瓶颈,这是第二个for-loop插入if-condition,检查similiariy是否> =我们想要的,如果是这种情况,那么打破循环。现在它在1秒内运行。谢谢大家
答案 0 :(得分:1)
你的内循环使用奇怪的双重检查。这可能会阻止重要的JIT优化,删除冗余范围检查。
//foreach (var item myList)
for (int j = 0; j < myList.Count-1; j++)
{
string item1 = myList[j];
for (int i = j + 1; i < myList.Count; i++)
{
string item2 = myList[i];
// if (i == myList.Count)
...
}
}
答案 1 :(得分:0)
downvotes的数量很疯狂但很好......由于评论,我找到了我的性能问题/瓶颈的原因。
StartSimilarityCheck()
内的第二个for循环遍历所有条目,这本身没有问题但是在性能问题和高效下查看时很糟糕。解决方案是只检查附近的字符串,但我们如何知道它们是否是?
首先,我们得到一个按提升排序的列表。这给了我们类似字符串的粗略概述。现在我们定义一个threshold
的Levenshtein分数(较小的分数是两个字符串之间的较高相似度)。如果分数高于阈值,则意味着它们不太相似,因此我们可以突破内循环。这节省了时间,程序可以很快完成。请注意,这种方式不是防弹,恕我直言,因为如果第一个字符串是0Directory
,它将位于列表的开头部分,但是像zDirectory
这样的字符串将会进一步向下并且可能会被遗漏。如果我错了,请纠正我。
private static void StartSimilarityCheck(List<string> whiteList)
{
var watch = Stopwatch.StartNew();
for (int j = 0; j < whiteList.Count - 1; j++)
{
string dirName = whiteList[j];
bool insertDirName = true;
int threshold = 2;
if (!IsBlackList(dirName))
{
// start the next element
for (int i = j + 1; i < whiteList.Count; i++)
{
// end of index reached
if (i == whiteList.Count)
{
break;
}
int similiarity = LevenshteinDistance.Compute(dirName, whiteList[i]);
if (similiarity >= threshold)
{
break;
}
// low score means high similarity
if (similiarity <= threshold)
{
if (insertDirName)
{
AddSimilarEntry(dirName);
AddSimilarEntry(whiteList[i]);
AddBlackListEntry(whiteList[i]);
insertDirName = false;
}
else
{
AddBlackListEntry(whiteList[i]);
}
}
}
}
Console.WriteLine(j);
}
watch.Stop();
Console.WriteLine("Ms: " + watch.ElapsedMilliseconds);
Console.WriteLine("Similar entries: " + similar.Count);
}