在.NET中获取目录数据的最快方法

时间:2014-10-12 02:49:30

标签: c# .net windows visual-c++ boost

我正在开发文件同步服务,以便在不同计算机上的两个文件夹之间同步文件。我需要找到一种非常快速的方法来枚举目录并从中提取以下信息:

  • 一个或多个数据结构,包含此目录中所有文件和子目录的路径,以及每个文件或子目录的最后写入时间。
  • 对于在当前目录下任何级别找到的每个子目录,与上面相同。

到目前为止,我已经想出了这个:

// Benchmark driver: repeatedly enumerates drive C and prints how long each full pass took (in milliseconds).
static void Main(string[] args)
{
    // Both lists are (re)assigned through the out parameters on every enumeration attempt.
    List<Tuple<string, DateTime>> files;
    List<Tuple<string, DateTime>> directories;
    var timer = new Stopwatch();

    for (;;)
    {
        // Restart() zeroes the elapsed time and starts measuring in one step.
        timer.Restart();

        bool scanned = CheckFolderRecursiveSingleThreaded("C:\\", out files, out directories);
        while (!scanned)
        {
            // You can assume for all intents and purposes that drive C does exist and that you have access to it, which will cause this sleep to not get called.
            Thread.Sleep(1000);
            scanned = CheckFolderRecursiveSingleThreaded("C:\\", out files, out directories);
        }

        timer.Stop();
        Console.WriteLine(timer.ElapsedMilliseconds);

        // Do something with the information.
        Thread.Sleep(1000);
    }
}

// Recursively enumerates `path` on the calling thread.
// On success returns true; `files` holds (full path, last write time UTC) for every file found at any depth and
// `directories` holds the same for every sub-directory. Reparse points (symbolic links) are neither listed nor entered.
// A sub-directory that cannot be enumerated is still listed, but its contents are skipped.
// If `path` itself cannot be enumerated, returns false and both out parameters are null.
static bool CheckFolderRecursiveSingleThreaded(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    // Failure values; only replaced once the whole level has been read successfully.
    files = null;
    directories = null;
    try
    {
        var root = new DirectoryInfo(path);
        var collectedFiles = new List<Tuple<string, DateTime>>();
        var collectedDirectories = new List<Tuple<string, DateTime>>();

        // Files directly inside this directory come first.
        collectedFiles.AddRange(Array.ConvertAll(root.GetFiles(), entry => Tuple.Create(entry.FullName, entry.LastWriteTimeUtc)));

        foreach (DirectoryInfo child in root.GetDirectories())
        {
            // The ReparsePoint flag indicates a symbolic link; skip those entirely.
            if ((child.Attributes & FileAttributes.ReparsePoint) == FileAttributes.ReparsePoint)
            {
                continue;
            }

            collectedDirectories.Add(Tuple.Create(child.FullName, child.LastWriteTimeUtc));

            List<Tuple<string, DateTime>> nestedFiles;
            List<Tuple<string, DateTime>> nestedDirectories;
            bool nestedOk = CheckFolderRecursiveSingleThreaded(child.FullName, out nestedFiles, out nestedDirectories);
            if (!nestedOk)
            {
                // Unreadable sub-tree: keep the directory entry, drop its contents.
                continue;
            }
            collectedFiles.AddRange(nestedFiles);
            collectedDirectories.AddRange(nestedDirectories);
        }

        files = collectedFiles;
        directories = collectedDirectories;
        return true;
    }
    catch
    {
        return false;
    }
}

性能方面,枚举我的 C:\ 驱动器大约需要 22 秒(在未附加调试器的情况下,发布模式和调试模式耗时相同),生成的列表包含大约 549,254 个文件和 83,235 个文件夹。但可以更快吗?我对任何建议持开放态度,甚至是 MSVC++ 方面的建议。

编辑:借助多线程,使用 LINQ 的 AsParallel 只需约 12 秒(必须在发布模式下测试)。请注意,这里只对 C:\ 的直接子文件夹做并行化,每个子文件夹内部仍然递归调用上面的单线程实现;否则,对所有层级的文件夹都做并行化会耗费非常长的时间!

/// <summary>
/// Enumerates <paramref name="path"/> by scanning each of its immediate sub-directories in parallel; every
/// sub-tree is walked with <c>CheckFolderRecursiveSingleThreaded</c>.
/// </summary>
/// <param name="path">Directory to enumerate.</param>
/// <param name="files">On success, (full path, last write time UTC) for every file found; null on failure.</param>
/// <param name="directories">On success, the same for every sub-directory found; null on failure.</param>
/// <returns>True when the top level could be enumerated; false if any exception occurred.</returns>
/// <remarks>
/// The workers never touch the shared result lists: <see cref="List{T}"/> is not safe for concurrent writers,
/// so each worker returns its own sub-tree result and the results are merged on the calling thread afterwards.
/// <c>AsOrdered</c> keeps the output in the same order as the single-threaded implementation.
/// </remarks>
static bool CheckFolderParallelled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        DirectoryInfo directoryInformation = new DirectoryInfo(path);
        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        foreach (FileInfo file in directoryInformation.GetFiles())
        {
            fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
        }

        var subtreeResults = directoryInformation.GetDirectories()
            .AsParallel()
            .AsOrdered()
            // Check for the ReparsePoint flag, which will indicate a symbolic link.
            .Where(directory => !directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
            .Select(directory =>
            {
                Tuple<string, DateTime> entry = new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc);
                List<Tuple<string, DateTime>> directoryFiles;
                List<Tuple<string, DateTime>> directoryFolders;
                bool scanned = CheckFolderRecursiveSingleThreaded(directory.FullName, out directoryFiles, out directoryFolders);
                return new { Entry = entry, Scanned = scanned, Files = directoryFiles, Folders = directoryFolders };
            })
            .ToList();

        // Single-threaded merge: the directory itself is always listed, its contents only if the scan succeeded.
        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
        foreach (var subtree in subtreeResults)
        {
            directoryList.Add(subtree.Entry);
            if (subtree.Scanned)
            {
                fileList.AddRange(subtree.Files);
                directoryList.AddRange(subtree.Folders);
            }
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception)
    {
        // Best effort by design: any failure (including an AggregateException from PLINQ) is reported as "false".
        files = null;
        directories = null;
        return false;
    }
}

编辑:采用 Alexei 所给链接中被采纳的 Mark Gravell 的答案,仍然需要大约 21 秒。这种非递归技术并不是最快的(维护这个 Queue 数据结构的开销,可能与在调用栈上压入和弹出该方法调用的开销一样昂贵):

/// <summary>
/// Enumerates <paramref name="path"/> breadth-first without recursion, using an explicit queue of pending
/// directories.
/// </summary>
/// <param name="path">Directory to enumerate.</param>
/// <param name="files">On success, (full path, last write time UTC) for every file found; null on failure.</param>
/// <param name="directories">On success, the same for every sub-directory found; null on failure.</param>
/// <returns>
/// True when the root directory could be enumerated. False (with null lists) when the root itself is missing or
/// unreadable, matching <c>CheckFolderRecursiveSingleThreaded</c>. Sub-directories that cannot be read are
/// listed but their contents are skipped.
/// </returns>
static bool CheckFolderNonRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    files = null;
    directories = null;
    try
    {
        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();

        // The walk is single-threaded, so a plain Queue is sufficient; it avoids the synchronisation cost of
        // ConcurrentQueue and of polling its Count on every iteration.
        Queue<DirectoryInfo> pendingSearches = new Queue<DirectoryInfo>();
        DirectoryInfo root = new DirectoryInfo(path);
        pendingSearches.Enqueue(root);

        while (pendingSearches.Count > 0)
        {
            DirectoryInfo pendingDirectory = pendingSearches.Dequeue();
            try
            {
                foreach (FileInfo file in pendingDirectory.GetFiles())
                {
                    fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
                }
                foreach (DirectoryInfo directory in pendingDirectory.GetDirectories())
                {
                    // Check for the ReparsePoint flag, which will indicate a symbolic link.
                    if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
                    {
                        directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
                        pendingSearches.Enqueue(directory);
                    }
                }
            }
            catch (Exception)
            {
                // A failure on the root means the whole scan failed: rethrow so the outer handler reports false
                // instead of returning "success" with empty lists.
                if (ReferenceEquals(pendingDirectory, root))
                {
                    throw;
                }
                // Otherwise ignore directories with no access rights (best effort).
            }
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception)
    {
        return false;
    }
}

编辑:这个问题并不局限于 .NET,因为使用 boost 之类的 MSVC++ 库也许有更快的办法,但我还没有遇到过更快的方法。如果有人能用 C++ 写出更快的 C 盘枚举程序,取得同样的数据并击败我的 C# 方法,首先要向你致敬,其次我真的很有兴趣看看它,第三,这会帮到很多人(不仅仅是我自己)。我对 boost 的尝试只进行到下面这一步,因为我发现以下方法耗时约 200,000 毫秒,比我上面发布的任何代码都要长得多:

#include "stdafx.h"
#include <iostream>
#include <Windows.h>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/timer.hpp>

namespace fs = boost::filesystem;

bool IterateDirectory(const wchar_t *directory);

int _tmain(int argc, _TCHAR* argv[])
{
    boost::timer timer = boost::timer();
    while (true)
    {
        timer.restart();
        // L makes it wide, since IterateDirectory takes wchar_t.
        // R makes it a raw string literal, which tells the compiler to parse the string as-is, not escape characters and fancy tricks.
        IterateDirectory(LR"(C:\)");
        std::cout << "Elapsed time: " << timer.elapsed() * 1000 << " ms" << std::endl;
        Sleep(1000);
    }
    return 0;
}

// IterateDirectory takes wchar_t because path.c_str() always returns wchar_t whether you are using unicode or multibyte.
bool IterateDirectory(const wchar_t *directory)
{
    if (boost::filesystem::exists(directory))
    {
        fs::directory_iterator it(directory), eod;
        BOOST_FOREACH(fs::path path, std::make_pair(it, eod))
        {
            try
            {
                if (is_regular_file(path))
                {
                    //std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
                }
                if (is_directory(path))
                {
                    //std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
                    // path.c_str() always returns wchar_t, whether you are using unicode or multibyte. This is probably because of multi-language support inside of the Windows operating system and file structure.
                    IterateDirectory(path.c_str());
                }
            }
            catch (...) { } // Ignore directories we don't have access to.
        }
        return true;
    }
    return false;
}

编辑:通过 P/Invoke 调用 FindFirstFile 和 FindNextFile,遍历我的整个 C 盘只花了大约 6 秒(感谢重复问题的链接和 Sam Saffron 的回答)。但是……还可以更快吗?

// P/Invoke declarations for the Win32 directory-search functions exported by kernel32.dll.

// Starts a search for lpFileName (callers in this file pass "<directory>\*") and fills lpFindFileData with the
// first match. Callers compare the returned handle against INVALID_HANDLE_VALUE to detect failure.
// SetLastError = true makes the Win32 error code available through Marshal.GetLastWin32Error().
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);

// Advances the search identified by hFindFile and fills lpFindFileData with the next match; callers loop
// until this returns false.
// NOTE(review): no ExactSpelling is set, so with CharSet.Unicode this is expected to bind to the
// FindNextFileW export — confirm against the DllImport documentation.
[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);

// Closes a search handle returned by FindFirstFileW.
[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);

// Managed mirror of the Win32 WIN32_FIND_DATAW structure that FindFirstFileW / FindNextFile fill in.
// The layout is sequential and the strings are marshalled as Unicode, so field order, field types and the
// SizeConst values must not be changed.
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
    // Attribute flags of the entry; the enumerators test Directory and ReparsePoint.
    public FileAttributes dwFileAttributes;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
    // The only timestamp the enumerators read (converted via the ToDateTime extension).
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
    // High and low 32-bit halves of the file size (not used by the enumerators in this file).
    public int nFileSizeHigh;
    public int nFileSizeLow;
    public int dwReserved0;
    public int dwReserved1;
    // Entry name without the directory part, stored inline as a fixed buffer of 260 characters (Win32 MAX_PATH).
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
    public string cFileName;
    // Alternate (short 8.3 style) name, stored inline as a fixed buffer of 14 characters.
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
    public string cAlternateFileName;
}

// Sentinel handle value (-1) that the enumerators compare against to detect a failed FindFirstFileW call.
static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

/// <summary>
/// Recursively enumerates <paramref name="path"/> through the Win32 FindFirstFileW / FindNextFile API.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">On success, (full path, last write time UTC) for every file found; null on failure.</param>
/// <param name="directories">On success, the same for every sub-directory found; null on failure.</param>
/// <returns>
/// False only when an exception was thrown (it is logged to the console). A directory whose search cannot be
/// started (FindFirstFileW returns INVALID_HANDLE_VALUE) yields true with empty lists.
/// </returns>
static bool FindNextFilePInvokeRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    files = null;
    directories = null;
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        // Normalise once so both the search pattern and every child path get exactly one separator.
        string prefix = path.EndsWith("\\", StringComparison.Ordinal) ? path : path + "\\";

        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(prefix + "*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip the current-directory and parent-directory entries.
                if (findData.cFileName == "." || findData.cFileName == "..")
                {
                    continue;
                }

                string fullPath = prefix + findData.cFileName;
                // Bitwise tests instead of Enum.HasFlag: HasFlag boxes on older runtimes and this runs per entry.
                bool isDirectory = (findData.dwFileAttributes & FileAttributes.Directory) != 0;
                bool isReparsePoint = (findData.dwFileAttributes & FileAttributes.ReparsePoint) != 0;

                if (!isDirectory)
                {
                    fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
                else if (!isReparsePoint)
                {
                    // Only real directories are entered: symbolic links could lead to repeated files and
                    // folders as well as infinite loops.
                    directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                    List<Tuple<string, DateTime>> subDirectoryFileList;
                    List<Tuple<string, DateTime>> subDirectoryDirectoryList;
                    if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        return false;
    }
    finally
    {
        // Single cleanup point: the search handle is released on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            FindClose(findHandle);
        }
    }
}

/// <summary>Conversion helpers for the interop FILETIME structure.</summary>
public static class FILETIMEExtensions
{
    /// <summary>
    /// Converts a Win32 FILETIME into a UTC <see cref="DateTime"/>.
    /// </summary>
    /// <param name="filetime">The value to convert; its two 32-bit halves are combined into one 64-bit tick count.</param>
    /// <returns>The UTC <see cref="DateTime"/> produced by <see cref="DateTime.FromFileTimeUtc(long)"/>.</returns>
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME filetime)
    {
        // Both halves are declared as signed ints, but the low half is really an unsigned 32-bit quantity.
        // Going through uint prevents sign extension: without it, any value whose low word has bit 31 set
        // would be combined as a negative number and the result would be 2^32 ticks (about 7 minutes) early.
        long highBits = (long)filetime.dwHighDateTime << 32;
        long lowBits = unchecked((uint)filetime.dwLowDateTime);
        return DateTime.FromFileTimeUtc(highBits | lowBits);
    }
}

编辑:是的,它可以更快。通过把目标文件夹各个子目录的递归并行化,并使用上面的 FindNextFilePInvokeRecursive 方法,我把耗时降到了 4 秒。也就是说,只用 4 秒就遍历了我的整个 C 盘并取得了我需要的数据。我在进程监视器中看到,CPU 占用约为 30%,磁盘占用最多只有 1%,这让我觉得有点奇怪,目前还不清楚原因,也许只是这种类似链表遍历的方式使磁盘开销变得微不足道。理想情况下,它应该把 CPU 占满(100%),但这可能取决于被并行化的子文件夹的数量和深度。但它还能更快吗?!

/// <summary>
/// Enumerates <paramref name="path"/> through FindFirstFileW / FindNextFile and then walks each top-level
/// sub-directory in parallel with <c>FindNextFilePInvokeRecursive</c>.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">On success, (full path, last write time UTC) for every file found; null on failure.</param>
/// <param name="directories">On success, the same for every sub-directory found; null on failure.</param>
/// <returns>False only when an exception was thrown (it is logged to the console); true otherwise.</returns>
/// <remarks>
/// The order of entries contributed by different sub-trees depends on which worker finishes first.
/// </remarks>
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    files = null;
    directories = null;
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    object fileListLock = new object();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    object directoryListLock = new object();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        try
        {
            WIN32_FIND_DATAW findData;
            findHandle = FindFirstFileW(path + @"\*", out findData);
            if (findHandle != INVALID_HANDLE_VALUE)
            {
                do
                {
                    if (findData.cFileName == "." || findData.cFileName == "..")
                    {
                        continue;
                    }
                    string fullPath = path + (path.EndsWith("\\") ? String.Empty : "\\") + findData.cFileName;
                    // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                    if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                    {
                        directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                    }
                    else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                    {
                        fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                    }
                }
                while (FindNextFile(findHandle, out findData));
            }
        }
        finally
        {
            // The top-level search is finished; release its handle before the parallel phase starts.
            if (findHandle != INVALID_HANDLE_VALUE)
            {
                FindClose(findHandle);
            }
        }

        // Snapshot the top-level directories: the workers append to directoryList, and a list must not be
        // modified while it is being used as the source of the parallel query.
        Tuple<string, DateTime>[] topLevelDirectories = directoryList.ToArray();
        topLevelDirectories.AsParallel().ForAll(x =>
        {
            List<Tuple<string, DateTime>> subDirectoryFileList;
            List<Tuple<string, DateTime>> subDirectoryDirectoryList;
            if (FindNextFilePInvokeRecursive(x.Item1, out subDirectoryFileList, out subDirectoryDirectoryList))
            {
                // List<T> is not safe for concurrent writers, so every merge is serialised.
                lock (fileListLock)
                {
                    fileList.AddRange(subDirectoryFileList);
                }
                lock (directoryListLock)
                {
                    directoryList.AddRange(subDirectoryDirectoryList);
                }
            }
        });

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        return false;
    }
}

编辑:之前忘记在并行处理时加锁,不加锁可能会遇到异常。另外我去掉了元组(Tuple),改用符合我需求的 FileInformation / DirectoryInformation 类,这又省掉了 0.5 秒。现在枚举我的 C: 驱动器只需 3.5 秒。

// P/Invoke declarations for the Win32 directory-search functions exported by kernel32.dll.

// Starts a search for lpFileName (callers in this file pass "<directory>\*") and fills lpFindFileData with the
// first match. Callers compare the returned handle against INVALID_HANDLE_VALUE to detect failure.
// SetLastError = true makes the Win32 error code available through Marshal.GetLastWin32Error().
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);

// Advances the search identified by hFindFile and fills lpFindFileData with the next match; callers loop
// until this returns false.
// NOTE(review): no ExactSpelling is set, so with CharSet.Unicode this is expected to bind to the
// FindNextFileW export — confirm against the DllImport documentation.
[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);

// Closes a search handle returned by FindFirstFileW.
[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);

// Managed mirror of the Win32 WIN32_FIND_DATAW structure that FindFirstFileW / FindNextFile fill in.
// The layout is sequential and the strings are marshalled as Unicode, so field order, field types and the
// SizeConst values must not be changed.
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
    // Attribute flags of the entry; the enumerators test Directory and ReparsePoint.
    public FileAttributes dwFileAttributes;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
    // The only timestamp the enumerators read (converted via the ToDateTime extension).
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
    // High and low 32-bit halves of the file size (not used by the enumerators in this file).
    public int nFileSizeHigh;
    public int nFileSizeLow;
    public int dwReserved0;
    public int dwReserved1;
    // Entry name without the directory part, stored inline as a fixed buffer of 260 characters (Win32 MAX_PATH).
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
    public string cFileName;
    // Alternate (short 8.3 style) name, stored inline as a fixed buffer of 14 characters.
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
    public string cAlternateFileName;
}

// Sentinel handle value (-1) that the enumerators compare against to detect a failed FindFirstFileW call.
static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

/// <summary>
/// Recursively enumerates <paramref name="path"/> through the Win32 FindFirstFileW / FindNextFile API.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">On success, one entry (full path and last write time) per file found at any depth; null on failure.</param>
/// <param name="directories">On success, one entry per sub-directory found at any depth; null on failure.</param>
/// <returns>
/// False only when an exception was thrown (it is logged to the console). A directory whose search cannot be
/// started (FindFirstFileW returns INVALID_HANDLE_VALUE) yields true with empty lists.
/// </returns>
static bool FindNextFilePInvokeRecursive(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    files = null;
    directories = null;
    List<FileInformation> fileList = new List<FileInformation>();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        // Normalise once: a caller-supplied trailing backslash (e.g. "C:\") must not produce "C:\\name".
        string prefix = path.EndsWith(@"\", StringComparison.Ordinal) ? path : path + @"\";

        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(prefix + "*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip current directory and parent directory symbols that are returned.
                if (findData.cFileName == "." || findData.cFileName == "..")
                {
                    continue;
                }

                string fullPath = prefix + findData.cFileName;
                // Bitwise tests instead of Enum.HasFlag: HasFlag boxes on older runtimes and this runs per entry.
                bool isDirectory = (findData.dwFileAttributes & FileAttributes.Directory) != 0;
                bool isReparsePoint = (findData.dwFileAttributes & FileAttributes.ReparsePoint) != 0;

                if (!isDirectory)
                {
                    fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                }
                else if (!isReparsePoint)
                {
                    // Only real directories are entered: symbolic links could lead to repeated files and
                    // folders as well as infinite loops.
                    directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                    List<FileInformation> subDirectoryFileList;
                    List<DirectoryInformation> subDirectoryDirectoryList;
                    if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        return false;
    }
    finally
    {
        // Single cleanup point: the search handle is released on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            FindClose(findHandle);
        }
    }
}

/// <summary>
/// Enumerates <paramref name="path"/> through FindFirstFileW / FindNextFile and then walks each top-level
/// sub-directory in parallel with <c>FindNextFilePInvokeRecursive</c>.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">On success, one entry (full path and last write time) per file found at any depth; null on failure.</param>
/// <param name="directories">On success, one entry per sub-directory found at any depth; null on failure.</param>
/// <returns>False only when an exception was thrown (it is logged to the console); true otherwise.</returns>
/// <remarks>
/// The order of entries contributed by different sub-trees depends on which worker finishes first.
/// </remarks>
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    files = null;
    directories = null;
    List<FileInformation> fileList = new List<FileInformation>();
    object fileListLock = new object();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    object directoryListLock = new object();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        try
        {
            path = path.EndsWith(@"\") ? path : path + @"\";
            WIN32_FIND_DATAW findData;
            findHandle = FindFirstFileW(path + @"*", out findData);
            if (findHandle != INVALID_HANDLE_VALUE)
            {
                do
                {
                    // Skip current directory and parent directory symbols that are returned.
                    if (findData.cFileName != "." && findData.cFileName != "..")
                    {
                        string fullPath = path + findData.cFileName;
                        // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                        if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                        {
                            directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                        }
                        else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                        {
                            fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                        }
                    }
                }
                while (FindNextFile(findHandle, out findData));
            }
        }
        finally
        {
            // The top-level search is finished; release its handle before the parallel phase starts.
            if (findHandle != INVALID_HANDLE_VALUE)
            {
                FindClose(findHandle);
            }
        }

        // Snapshot the top-level directories: the workers append to directoryList (under a lock), and a list
        // must not be modified while it is being used as the source of the parallel query.
        DirectoryInformation[] topLevelDirectories = directoryList.ToArray();
        topLevelDirectories.AsParallel().ForAll(x =>
        {
            List<FileInformation> subDirectoryFileList;
            List<DirectoryInformation> subDirectoryDirectoryList;
            if (FindNextFilePInvokeRecursive(x.FullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
            {
                // List<T> is not safe for concurrent writers, so every merge is serialised.
                lock (fileListLock)
                {
                    fileList.AddRange(subDirectoryFileList);
                }
                lock (directoryListLock)
                {
                    directoryList.AddRange(subDirectoryDirectoryList);
                }
            }
        });

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        return false;
    }
}

/// <summary>One file found by the enumerators: its full path and last write time.</summary>
public class FileInformation
{
    /// <summary>Full path of the file (directory prefix plus the name reported by the Win32 search).</summary>
    public string FullPath;
    /// <summary>Last write time, as produced by the FILETIME ToDateTime extension.</summary>
    public DateTime LastWriteTime;
}

/// <summary>One directory found by the enumerators: its full path and last write time.</summary>
public class DirectoryInformation
{
    /// <summary>Full path of the directory (parent prefix plus the name reported by the Win32 search).</summary>
    public string FullPath;
    /// <summary>Last write time, as produced by the FILETIME ToDateTime extension.</summary>
    public DateTime LastWriteTime;
}

编辑:B.K. 询问了如何从 FILETIME 转换为 DateTime:

/// <summary>Conversion helpers for the interop FILETIME structure.</summary>
public static class FILETIMEExtensions
{
    /// <summary>
    /// Converts a Win32 FILETIME into a UTC <see cref="DateTime"/>.
    /// </summary>
    /// <param name="time">The value to convert; its two 32-bit halves are combined into one 64-bit tick count.</param>
    /// <returns>The UTC <see cref="DateTime"/> produced by <see cref="DateTime.FromFileTimeUtc(long)"/>.</returns>
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME time)
    {
        // The halves are declared as signed ints. Casting a negative int straight to ulong sign-extends it
        // (or throws in a checked context), which corrupts the upper 32 bits whenever the low word has bit 31
        // set. Reinterpreting each half as uint first keeps exactly its 32 bits.
        ulong high = unchecked((uint)time.dwHighDateTime);
        ulong low = unchecked((uint)time.dwLowDateTime);
        long fileTime = unchecked((long)((high << 32) | low));
        return DateTime.FromFileTimeUtc(fileTime);
    }
}

1 个答案:

答案 0 :(得分:1)

使用LINQ和并行任务

// Collect every file below `dir` in one call, then process the results in parallel.
// NOTE(review): GetFiles with SearchOption.AllDirectories is expected to throw as soon as it reaches a
// directory it cannot read, so on a whole drive this may fail part-way — confirm before relying on it.
var stuff = dir.GetFiles("*.*", System.IO.SearchOption.AllDirectories);
Parallel.ForEach(stuff, p => { /* do things in parallel... */ });
// or this
var q = stuff.AsParallel().Where(x => p(x)).OrderBy(x => k(x)).Select(x => f(x));
foreach (var e in q) a(e);