LINQ连接来自HashSet,Join vs Dictionary和HashSet性能的实体

时间:2015-06-19 10:04:00

标签: c# performance linq

我有HashSet,每个存储T,我编写了一个测试应用程序来比较我能想到的不同关系算法,但是我对我得到的结果并不满意。

是否存在更有效的方法来实现对象关系,那么我正在测试一次?

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Diagnostics;

namespace LINQTests
{
    class Program
    {
        static void Main(string[] args)
        {
            HashSet<User> UserTable = new HashSet<User>();
            HashSet<UserProperty> UserPropertyTable = new HashSet<UserProperty>();

            #region Lets create some dummy data.

            Console.WriteLine("Please wait while creating dummy data, this can take a while...");
            Console.WriteLine("");

            int rows = 1000000;
            for(int x = 0; x < rows; x++)
            {
                Random rnd = new Random();

                // Add new user.
                User user = new User(string.Format("Joachim-{0}", x));
                if(!UserTable.Add(user))
                {
                    // throw new Exception.
                }
                else
                {
                    UserProperty age = new UserProperty(user, "Age", rnd.Next(25, 30).ToString());
                    if(!UserPropertyTable.Add(age))
                    {
                        // throw new Exception.
                    }

                    UserProperty sex = new UserProperty(user, "Sex", "Male");
                    if (!UserPropertyTable.Add(sex))
                    {
                        // throw new Exception.
                    }

                    UserProperty location = new UserProperty(user, "Location", "Norway");
                    if (!UserPropertyTable.Add(location))
                    {
                        // throw new Exception.
                    }
                }
            }

            #endregion

            #region Lets do some query tests.

            IEnumerable<User> Users;
            Stopwatch stopwatch = new Stopwatch();
            int matches = 0;

            // Lets find all users who are of age 29.
            Console.WriteLine("Finding all users who are of age 29");
            Console.WriteLine("");
            Console.WriteLine("---------------------------------------------------");
            Console.WriteLine("{0,-20} | {1,6} | {2,9}", "Search Strategy", "Found", "Time");
            Console.WriteLine("---------------------------------------------------");

            // Join test.
            stopwatch.Start();

            Users = (from user in UserTable
                     join property in UserPropertyTable on user.Id equals property.UserId
                     where property.Key == "Age" && property.Value == "29"
                     select user);

            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Joining Tables", matches, stopwatch.ElapsedMilliseconds);

            // Dictionary test.
            stopwatch.Restart();

            var DictionarySearch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t).ToDictionary(x => x.UserId);
            Users = (from t in UserTable where DictionarySearch.ContainsKey(t.Id) select t);

            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Dictionary Contain", matches, stopwatch.ElapsedMilliseconds);

            // HashSet test.
            stopwatch.Restart();

            var HashsetSearch = new HashSet<Guid>(from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId);

            Users = (from t in UserTable where HashsetSearch.Contains(t.Id) select t);

            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "HashSet Contain", matches, stopwatch.ElapsedMilliseconds);

            // Following takes so long that we wont run them!

            //// Array test.
            //stopwatch.Restart();

            //var ArrayMatch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId).ToArray();
            //Users = (from t in UserTable where ArrayMatch.Contains(t.Id) select t);

            //matches = Users.Count();
            //stopwatch.Stop();
            //Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Array Contain", matches, stopwatch.ElapsedMilliseconds);


            //// List test.
            //stopwatch.Restart();

            //var ListMatch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId).ToList();
            //Users = (from t in UserTable where ListMatch.Contains(t.Id) select t);

            //matches = Users.Count();
            //stopwatch.Stop();
            //Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "List Contain", matches, stopwatch.ElapsedMilliseconds);


            Console.WriteLine("---------------------------------------------------");

            #endregion


            Console.WriteLine("");
            Console.WriteLine("Hit return to exit...");
            Console.Read();
        }
    }

    public class User
    {
        public User(string UserName)
        {
            this.Id = Guid.NewGuid();
            this.UserName = UserName;
        }

        public Guid Id { get; set; }
        public string UserName { get; set; }

        public override bool Equals(object obj)
        {
            User other = obj as User;

            if (other == null)
                return false;

            return this.Id == other.Id;
        }

        public override int GetHashCode()
        {
            return Id.GetHashCode();
        }
    }

    public class UserProperty
    {
        public UserProperty(User user, string key, string value)
        {
            this.Id = Guid.NewGuid();
            this.UserId = user.Id;
            this.Key = key;
            this.Value = value;
        }

        public Guid Id { get; private set; }
        public Guid UserId {get; private set;}
        public string Key { get; set; }
        public string Value { get; set; }

        public override bool Equals(object obj)
        {
            UserProperty other = obj as UserProperty;

            if (other == null)
                return false;

            return this.UserId == other.UserId && this.Key == other.Key;
        }

        public override int GetHashCode()
        {
            return string.Format("{0}-{1}", this.UserId, this.Key).GetHashCode();
        }
    }
}

2 个答案:

答案 0 :(得分:2)

您可以执行以下操作:

  1. 为每个添加的元素和被探测的每个元素调用GetHashCode(哈希表如何比较两个哈希码?)。对于每个探测,至少调用一次等于一次。优化那些。不要使用string.Format(!)。它需要解析格式字符串并分配内存。 UserId.GHC() * 37 ^ Name.GHC()应该这样做。
  2. 不要使用LINQ。编写手动循环。 LINQ对每个正在处理的项目有多个vcall或委托调用。这会添加指令,使CPU停顿并阻止内联。
  3. 你能预先计算数据结构吗?预计算哈希表或排序列表。合并两个排序列表可能比使用哈希表更快。这不是框架内置的。很多用于编写合并连接的硬自定义代码。
  4. 为什么在测量中包含property.Key == "Age" && property.Value == "29"谓词?是否需要为每次执行运行?您可以尝试使用整数值而不是字符串来优化它。也许您可以预先计算UserPropertyTable的索引,以便您可以在固定时间内获得匹配的元素。
  5. 根据您的实施程度,这应该为您提供1-10倍的加速。如果你需要比这更快,我需要先问一些问题。通常,您可以找到仅适用于特定情况的专业技巧。

答案 1 :(得分:1)

这将使linq / join与其他方法相当:

var properties = UserPropertyTable
    .Where(p=>p.Key == "Age" && p.Value == "29")
    .ToArray();

Users = (from user in UserTable
         join property in properties
         on user.Id equals property.UserId
         select user);

这是我能达到的最快(~2X)

var filteredUserIds = new HashSet<Guid>(
        UserPropertyTable
            .Where(p=>p.Key == "Age" && p.Value == "29")
            .Select(p=>p.UserId));

Users = (from user in UserTable
         where filteredUserIds.Contains(user.Id)
         select user);

输出

---------------------------------------------------
Search Strategy      |  Found |      Time
---------------------------------------------------
My method            | 210366 |    157 ms.
Dictionary Contain   | 210366 |    325 ms.
HashSet Contain      | 210366 |    325 ms.
---------------------------------------------------