我正在使用c#创建一个工具,它遍历一个大文件目录并提取某些信息。该目录按语言(LCID)组织,所以我想使用多线程来浏览目录 - 每个语言文件夹一个线程。
我的代码目前扫描少量文件并提取所需数据而不进行多线程处理,但是大规模需要花费太长时间。
我在循环中为每个LCID文件夹创建了一个线程,但是出现了以下错误:“'HBscan'没有与委托'System.Threading.ThreadStart'匹配的重载”。根据我在网上查到的资料,我把方法放进了一个类中,这样就可以传递参数;现在没有错误了,但代码没有正确地遍历文件——扫描时会漏掉一些文件。
我想知道是否有人能够看到我的代码在哪里出错,导致它无法正常运行?感谢。
/// <summary>
/// Entry point: starts one scanner thread per folder found directly under the
/// root directory, then blocks until every thread has finished.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    //change rootDirectory variable to point to directory which you wish to scan through
    string rootDirectory = @"C:\sample";
    //get the LCIDs from the folders
    string[] filePaths = Directory.GetDirectories(rootDirectory);
    // Keep a reference to every thread so that Main can wait for all of them below.
    Thread[] workers = new Thread[filePaths.Length];
    for (int i = 0; i < filePaths.Length; i++)
    {
        string LCID = filePaths[i].Split('\\').Last();
        Console.WriteLine(LCID);
        // The scanner is constructed here, on the main thread, so each thread gets
        // its own already-bound directory.
        HBScanner scanner = new HBScanner(new DirectoryInfo(filePaths[i]));
        workers[i] = new Thread(new ThreadStart(scanner.HBscan));
        workers[i].Start();
    }
    Console.WriteLine("Scanning through files...");
    // Wait for every scan so that returning from Main means the whole tree was processed.
    foreach (Thread worker in workers)
    {
        worker.Join();
    }
}
/// <summary>
/// Walks a directory tree and writes one <c>Item</c> per file to the console,
/// carrying the file's asset id, LCID, last-modified time, full path and name.
/// </summary>
public class HBScanner
{
    // Built once instead of once per file; Regex instances are immutable and can be
    // shared by the scanner threads.
    private static readonly Regex AssetIdRegex = new Regex(@"([A-Z][A-Z][0-9]{8,10})\.");
    private static readonly Regex LcidRegex = new Regex(@"sample\\(\d{4,5})");

    private DirectoryInfo DirectoryToScan { get; set; }

    /// <summary>Creates a scanner bound to <paramref name="startDir"/>.</summary>
    /// <param name="startDir">Directory at which the scan starts.</param>
    public HBScanner(DirectoryInfo startDir)
    {
        DirectoryToScan = startDir;
    }

    /// <summary>
    /// Scans the directory given to the constructor. Parameterless so it can be used
    /// directly as a <c>ThreadStart</c> target.
    /// </summary>
    public void HBscan()
    {
        HBscan(DirectoryToScan);
    }

    /// <summary>
    /// Processes every file in <paramref name="directoryToScan"/> and then recurses
    /// into each of its sub-directories.
    /// </summary>
    /// <param name="directoryToScan">Directory whose files and sub-directories are scanned.</param>
    public static void HBscan(DirectoryInfo directoryToScan)
    {
        //iterate through the directory and get file details
        foreach (FileInfo file in directoryToScan.GetFiles("*.*"))
        {
            //first check the file name for asset id using regular expression
            string asset = AssetIdRegex.Match(file.Name).Groups[1].Value;
            //get LCID from the file path using regular expression
            string lcid = LcidRegex.Match(file.FullName).Groups[1].Value;

            //if it can't find it from filename, it looks into xml
            // The extension is compared case-insensitively so that ".XML" files are
            // inspected as well.
            if (asset == "" && string.Equals(file.Extension, ".xml", StringComparison.OrdinalIgnoreCase))
            {
                System.Diagnostics.Debug.WriteLine("File is an .XML");
                System.Diagnostics.Debug.WriteLine("file.FullName is: " + file.FullName);
                asset = ReadAssetIdFromXml(file.FullName);
            }

            Item newFile = new Item
            {
                AssetID = asset,
                LCID = lcid,
                LastModifiedDate = file.LastWriteTime,
                Path = file.FullName,
                FileName = file.Name
            };
            Console.WriteLine(newFile);
        }

        //get sub-folders for the current directory
        foreach (DirectoryInfo dir in directoryToScan.GetDirectories("*.*"))
        {
            HBscan(dir);
        }
    }

    /// <summary>
    /// Reads an asset id from an XML file: the text of the first &lt;assetid&gt; element,
    /// otherwise the AssetId attribute of the first &lt;Asset&gt; element.
    /// </summary>
    /// <param name="path">Full path of the XML file to load.</param>
    /// <returns>The asset id, or an empty string when neither source is present.</returns>
    private static string ReadAssetIdFromXml(string path)
    {
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.Load(path);

        //check for <assetid> element
        XmlNode assetIdNode = xmlDoc.GetElementsByTagName("assetid")[0];
        if (assetIdNode != null)
        {
            return assetIdNode.InnerText;
        }

        //check for <Asset> element and see if it has an AssetId attribute
        XmlNode assetNode = xmlDoc.GetElementsByTagName("Asset")[0];
        if (assetNode != null && assetNode.Attributes != null)
        {
            // The attribute is null-checked before .Value is read; reading it
            // unconditionally throws when <Asset> carries no AssetId attribute.
            XmlAttribute assetIdAttribute = assetNode.Attributes["AssetId"];
            if (assetIdAttribute != null)
            {
                return assetIdAttribute.Value;
            }
        }

        return "";
    }
}
答案 0 :(得分:3)
我没有检查,但我认为这可行。
代码将为每个线程创建一个扫描程序并执行HBscan方法。
/// <summary>
/// Entry point: starts one scanner thread per folder found directly under the
/// root directory, then blocks until every thread has finished.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    //change rootDirectory variable to point to directory which you wish to scan through
    string rootDirectory = @"C:\sample";
    //get the LCIDs from the folders
    string[] filePaths = Directory.GetDirectories(rootDirectory);
    // Keep a reference to every thread so that Main can wait for all of them below.
    Thread[] workers = new Thread[filePaths.Length];
    for (int i = 0; i < filePaths.Length; i++)
    {
        // Copy the path into a per-iteration local. A lambda that captures the loop
        // counter `i` reads it when the thread actually runs, by which time the loop
        // may have advanced it (wrong folder, or an index past the end of the array).
        string lcidFolder = filePaths[i];
        string LCID = lcidFolder.Split('\\').Last();
        Console.WriteLine(LCID);
        workers[i] = new Thread(() => new HBScanner(new DirectoryInfo(lcidFolder)).HBscan());
        workers[i].Start();
    }
    Console.WriteLine("Scanning through files...");
    // Wait for every scan so that returning from Main means the whole tree was processed.
    foreach (Thread worker in workers)
    {
        worker.Join();
    }
}
/// <summary>
/// Walks a directory tree and writes one <c>Item</c> per file to the console,
/// carrying the file's asset id, LCID, last-modified time, full path and name.
/// </summary>
public class HBScanner
{
    // Built once instead of once per file; Regex instances are immutable and can be
    // shared by the scanner threads.
    private static readonly Regex AssetIdRegex = new Regex(@"([A-Z][A-Z][0-9]{8,10})\.");
    private static readonly Regex LcidRegex = new Regex(@"sample\\(\d{4,5})");

    private DirectoryInfo DirectoryToScan { get; set; }

    /// <summary>Creates a scanner bound to <paramref name="startDir"/>.</summary>
    /// <param name="startDir">Directory at which the scan starts.</param>
    public HBScanner(DirectoryInfo startDir)
    {
        DirectoryToScan = startDir;
    }

    /// <summary>
    /// Scans the directory given to the constructor. Parameterless so it can be run
    /// on its own thread without arguments.
    /// </summary>
    public void HBscan()
    {
        HBscan(DirectoryToScan);
    }

    /// <summary>
    /// Processes every file in <paramref name="directoryToScan"/> and then recurses
    /// into each of its sub-directories.
    /// </summary>
    /// <param name="directoryToScan">Directory whose files and sub-directories are scanned.</param>
    public static void HBscan(DirectoryInfo directoryToScan)
    {
        //iterate through the directory and get file details
        foreach (FileInfo file in directoryToScan.GetFiles("*.*"))
        {
            //first check the file name for asset id using regular expression
            string asset = AssetIdRegex.Match(file.Name).Groups[1].Value;
            //get LCID from the file path using regular expression
            string lcid = LcidRegex.Match(file.FullName).Groups[1].Value;

            //if it can't find it from filename, it looks into xml
            // The extension is compared case-insensitively so that ".XML" files are
            // inspected as well.
            if (asset == "" && string.Equals(file.Extension, ".xml", StringComparison.OrdinalIgnoreCase))
            {
                System.Diagnostics.Debug.WriteLine("File is an .XML");
                System.Diagnostics.Debug.WriteLine("file.FullName is: " + file.FullName);
                asset = ReadAssetIdFromXml(file.FullName);
            }

            Item newFile = new Item
            {
                AssetID = asset,
                LCID = lcid,
                LastModifiedDate = file.LastWriteTime,
                Path = file.FullName,
                FileName = file.Name
            };
            Console.WriteLine(newFile);
        }

        //get sub-folders for the current directory
        foreach (DirectoryInfo dir in directoryToScan.GetDirectories("*.*"))
        {
            HBscan(dir);
        }
    }

    /// <summary>
    /// Reads an asset id from an XML file: the text of the first &lt;assetid&gt; element,
    /// otherwise the AssetId attribute of the first &lt;Asset&gt; element.
    /// </summary>
    /// <param name="path">Full path of the XML file to load.</param>
    /// <returns>The asset id, or an empty string when neither source is present.</returns>
    private static string ReadAssetIdFromXml(string path)
    {
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.Load(path);

        //check for <assetid> element
        XmlNode assetIdNode = xmlDoc.GetElementsByTagName("assetid")[0];
        if (assetIdNode != null)
        {
            return assetIdNode.InnerText;
        }

        //check for <Asset> element and see if it has an AssetId attribute
        XmlNode assetNode = xmlDoc.GetElementsByTagName("Asset")[0];
        if (assetNode != null && assetNode.Attributes != null)
        {
            // The attribute is null-checked before .Value is read; reading it
            // unconditionally throws when <Asset> carries no AssetId attribute.
            XmlAttribute assetIdAttribute = assetNode.Attributes["AssetId"];
            if (assetIdAttribute != null)
            {
                return assetIdAttribute.Value;
            }
        }

        return "";
    }
}
答案 1 :(得分:2)
如果您使用的是.NET 4.0,则可以使用TPL并使用Parallel.For/Parallel.ForEach同时处理多个项目。
我几天前刚接触过它,非常有趣。它通过在不同内核上使用多个线程来加速您的工作,从而为您提供出色的性能。不过需要注意:由于这里的IO访问很多,实际的加速效果可能会受到限制。
但值得一试! (改变你当前的来源是非常容易的,只是检查出来)
答案 2 :(得分:2)
更像这样的东西呢,
/// <summary>
/// Entry point: scans every folder directly under the root directory in parallel
/// using PLINQ.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    const string rootDirectory = @"C:\sample";
    // The type and method names must match the scanner's declaration exactly:
    // class HBScanner, static method HBscan(DirectoryInfo).
    Directory.EnumerateDirectories(rootDirectory)
        .AsParallel()
        .ForAll(f => HBScanner.HBscan(new DirectoryInfo(f)));
}
毕竟,你在循环体内获取LCID只是为了把它写到控制台。如果你想保留这些控制台输出,那么可以这样写:
/// <summary>
/// Entry point: scans every folder directly under the root directory in parallel
/// using PLINQ, writing each folder's name to the console before scanning it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    const string rootDirectory = @"C:\sample";
    Console.WriteLine("Scanning through files...");
    Directory.EnumerateDirectories(rootDirectory)
        .AsParallel()
        .ForAll(f =>
        {
            // The last path segment is the folder name.
            var lcid = f.Split('\\').Last();
            Console.WriteLine(lcid);
            // The type and method names must match the scanner's declaration exactly:
            // class HBScanner, static method HBscan(DirectoryInfo).
            HBScanner.HBscan(new DirectoryInfo(f));
        });
}
请注意,应优先使用EnumerateDirectories而不是GetDirectories,因为它是惰性求值的,所以在找到第一个目录后就可以立即开始处理,而不必等待所有目录都加载到列表中。
答案 3 :(得分:1)
使用BlockingCollection http://msdn.microsoft.com/en-us/library/dd267312.aspx可以大大改善您的任务。
整体结构是这样的:你创建一个线程(或在主线程中执行此操作),它将枚举文件并将它们添加到BlockingCollection中。简单地枚举文件,应该相当快,并且这个线程应该比工作线程更快地完成。
然后,您创建了许多任务(与Environment.ProcessorCount相同的数字将是好的)。这些任务应该与docs的第一个示例(collection.Take())类似。任务应该对一个单独的文件执行检查。
这样一来,一个线程负责查找文件名并把它们放入BlockingCollection,其他线程则并行地检查文件内容。这种方式可以获得更好的并行性,因为如果按文件夹创建线程,工作量的分配可能会不均匀(你并不知道每个文件夹里有多少文件,对吧?)