【源码】浅看Dictionary(.Net Core)源码

注:本文参考的源码来自System.Private.CoreLib.dll,会有部分代码逻辑不同于.Net Framework的Dictionary。如需了解可参考【源码】浅看Dictionary(mscorlib)源码

.Net Core的 Dictionary(严格来说是.Net Core整个框架)的实现使用了大量的 ref,目的是尽可能使用栈而不是堆来分配内存进而提高性能

在自定义类中重写Equals()和GetHashCode()的时候有些好奇,所以想看看Dictionary查找Key和存储键值对的原理。

下面通过看看几个最常用的Dictionary的方法,了解一下Dictionary的实现原理。

GetHashCode

首先看看如果在重写Equals()的时候不重写GetHashCode()会出现什么情况

// Demo key type for the article: it overrides Equals but deliberately does NOT
// override GetHashCode, so equal cats are not guaranteed to share a hash code.
class Cat
{
    public string Name;
    public string Birth;

    // Field-by-field comparison on Name and Birth. Prints a marker line whenever
    // it runs against a non-null argument, making each invocation visible.
    public bool Equals(Cat cat)
    {
        if (cat is null)
        {
            return false;
        }

        Console.WriteLine("自定义_Cat");
        bool sameName = Name == cat.Name;
        bool sameBirth = Birth == cat.Birth;
        return sameName && sameBirth;
    }

    // Routes object-based equality to the typed overload; a non-Cat argument becomes null.
    public override bool Equals(object obj)
    {
        Cat other = obj as Cat;
        return Equals(other);
    }
}
// Fills a dictionary keyed by Cat (including two field-identical keys) and then
// probes it with a third, field-identical instance.
static void Main(string[] args)
{
    var cats = new Dictionary<Cat, string>();

    // The first and last keys carry the same Name/Birth; both Add calls are still issued.
    cats.Add(new Cat { Name = "小黑", Birth = "2020-2-1" }, "小黑");
    cats.Add(new Cat { Name = "小黄", Birth = "2020-4-1" }, "小黄");
    cats.Add(new Cat { Name = "憨豆", Birth = "2020-5-1" }, "憨豆");
    cats.Add(new Cat { Name = "小黑", Birth = "2020-2-1" }, "小黑2");

    // Look up a freshly constructed Cat whose fields match an existing key.
    var probe = new Cat { Name = "小黑", Birth = "2020-2-1" };
    bool found = cats.ContainsKey(probe);
    Console.WriteLine(found);
}

// 输出
// False

可以看到虽然有两个“一样”的Key,即 { new Cat { Name="小黑", Birth = "2020-2-1" }, "小黑" } 和 { new Cat { Name="小黑", Birth = "2020-2-1" }, "小黑2" },但是Dictionary还是正常初始化了,而在ContainsKey()时找不到Key。

ContainsKey

源码中的ContainsKey()直接 return !Unsafe.IsNullRef(ref FindValue(key)); ,再看看FindValue(TKey key)的实现

/// <summary>
/// Looks up <paramref name="key"/> and returns a reference to its stored value.
/// </summary>
/// <returns>
/// A ref to the matching entry's value, or a null ref (Unsafe.NullRef) when the key is absent.
/// </returns>
private ref TValue FindValue(TKey key)
{
    if (key == null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
    }

    ref Entry entry = ref Unsafe.NullRef<Entry>();
    if (_buckets != null)
    {
        Debug.Assert(_entries != null, "expected entries to be != null");
        IEqualityComparer<TKey>? comparer = _comparer;
        // The branches below differ only in which equality comparer they use (depending on TKey
        // and on whether a comparer was supplied); the lookup logic is the same, so only one
        // branch is shown in full here.
        if (comparer == null)
        {	// Per the article's reading of the constructor (not shown here): _comparer is null when TKey is not string and no comparer (or EqualityComparer<TKey>.Default) was passed, so EqualityComparer<TKey>.Default is used
            uint hashCode = (uint)key.GetHashCode();
            int i = GetBucket(hashCode);	// Bucket slot for this hash; it holds a 1-based index into entries
            Entry[]? entries = _entries;
            uint collisionCount = 0;
            if (typeof(TKey).IsValueType)
            {	// TKey is a value type
                // ValueType: Devirtualize with EqualityComparer<TKey>.Default intrinsic

                i--; // Value in _buckets is 1-based; subtract 1 from i. We do it here so it fuses with the following conditional.
                do
                {
                    if ((uint)i >= (uint)entries.Length) 
                    {	// An empty bucket (0) became -1 after the decrement, and a chain ends with next == -1; as uint, -1 exceeds entries.Length, so the key is not present
                        goto ReturnNotFound;
                    }

                    entry = ref entries[i];
                    if (entry.hashCode == hashCode && EqualityComparer<TKey>.Default.Equals(entry.key, key))
                    {	// Keys match only when the hash codes are equal AND Equals returns true
                        goto ReturnFound;
                    }

                    i = entry.next;	// Move on to the next entry chained in the same bucket

                    collisionCount++;	// Count how many chain entries have been visited so far
                } while (collisionCount <= (uint)entries.Length);

                // The chain of entries forms a loop; which means a concurrent update has happened.
                // Break out of the loop and throw, rather than looping forever.
                // In other words: falling out of the do/while means more entries were visited than exist,
                // so the chain is cyclic. This also shows that Dictionary is not thread-safe.
                goto ConcurrentOperation;
            }
            else	// TKey is not a value type
            {
                // Object type: Shared Generic, EqualityComparer<TValue>.Default won't devirtualize
                // https://github.com/dotnet/runtime/issues/10050
                // So cache in a local rather than get EqualityComparer per loop iteration
                EqualityComparer<TKey> defaultComparer = EqualityComparer<TKey>.Default;
                // ... code omitted in the article; see the official source linked at the bottom of the page
            }
        }
        else		// TKey is string, or a comparer was supplied
        {
            uint hashCode = (uint)comparer.GetHashCode(key);
            // ... code omitted in the article; see the official source linked at the bottom of the page
        }
    }

    goto ReturnNotFound;

    ConcurrentOperation:
    ThrowHelper.ThrowInvalidOperationException_ConcurrentOperationsNotSupported();
    ReturnFound:
    ref TValue value = ref entry.value;
    Return:
    return ref value;
    ReturnNotFound:
    value = ref Unsafe.NullRef<TValue>();
    goto Return;
}

if (entry.hashCode == hashCode && EqualityComparer<TKey>.Default.Equals(entry.key, key))

所以需要hashCode相同并且comparer.Equals(entries[i].key, key)返回true才认为两个Key一样,因为重写了Equals(),所以comparer.Equals(entries[i].key, key)返回的是true,而在没有重写GetHashCode()时,默认使用Object.GetHashCode()获取hashCode,我没有找到这个函数的实现,但基本可以确定跟对象的存储地址是有关系的,而我们是new了一个对象去找Key的,所以得到的hashCode必然跟字典中的Key的hashCode不一致,所以会出现这样的现象。

所以必须同时重写Equals()和GetHashCode(),比如在这个例子中,根据Name和Birth确定是否同一只Cat,

// Hash code consistent with Equals(Cat): derived from the same two fields, Name and Birth.
// HashCode.Combine is used instead of XOR-ing the two string hashes because:
//  - Equals treats null fields as comparable (null == null), while calling GetHashCode()
//    on a null field would throw NullReferenceException; Combine hashes null as 0.
//  - XOR is symmetric and cancels out: swapping Name and Birth gave the same hash, and
//    any cat whose Name equals its Birth hashed to 0.
public override int GetHashCode() {
    return HashCode.Combine(Name, Birth);
}

此时调用的是string.GetHashCode(),生成的哈希值只与string有关,只要string一样,得到的hashCode是一样的。



Dictionary的基本原理其实就是利用一个hash表对Key和Key_Value进行存储和寻址。要看懂源码先要有点哈希表相关的基础,没有数据结构基础的慢慢看也不难懂。

Add

先上图

左边的Dictionary有一个元素,entries[0].hashCode = 8,8 % buckets.Length = 1,因此,entries[0].next = buckets[1] - 1 = -1,buckets[1] = 0 + 1 = 1,实际指向entries[0]。

此时插入第二个元素,entries[1].hashCode = 15,15 % buckets.Length = 1,因此,entries[1].next = buckets[1] - 1 = 0,buckets[1] = 1 + 1 = 2,实际指向entries[1]。

带着图看源码

/// <summary>
/// Adds a new key/value pair; a duplicate key is reported by TryInsert throwing,
/// because the ThrowOnExisting behavior is requested.
/// </summary>
public void Add(TKey key, TValue value)
{
    bool inserted = TryInsert(key, value, InsertionBehavior.ThrowOnExisting);

    // If there was an existing key and the Add failed, an exception will already have been thrown.
    Debug.Assert(inserted);
}

再看TryInsert()

这里有两个关键的变量:_entries 和 _buckets

private Entry[]? _entries;	// Entries in slot order; each holds a key/value pair, the key's hash code and the index of the next entry in its chain
private int[]? _buckets;	// Hash table of 1-based indices into _entries; hashCode % _buckets.Length picks the slot, giving fast access to the chain head (per the article, .NET Framework stores the index without the +1)

// One slot of the _entries array.
private struct Entry {
    // Hash code of the key. Declared uint because TryInsert and Resize assign a uint
    // here and pass the field straight to GetBucket.
    public uint hashCode;
    // 0-based index of the next entry in the same bucket chain; -1 marks the end of the chain.
    // Values below -1 mark a removed (free) slot: Remove stores StartOfFreeList - _freeList here.
    public int next;
    public TKey key;           // Key of entry
    public TValue value;         // Value of entry
}
/// <summary>
/// Inserts <paramref name="key"/>/<paramref name="value"/>, or handles an already-present key
/// according to <paramref name="behavior"/>.
/// </summary>
/// <returns>
/// true when the pair was added or an existing value was overwritten; false when the key
/// already exists and <paramref name="behavior"/> is neither OverwriteExisting nor ThrowOnExisting.
/// (Declared bool: the body returns true/false and Add consumes the result.)
/// </returns>
private bool TryInsert(TKey key, TValue value, InsertionBehavior behavior) {
    if (key == null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
    }

    if (_buckets == null)
    {
        // First insertion: the bucket and entry arrays have not been created yet.
        Initialize(0);
    }
    Debug.Assert(_buckets != null);

    Entry[]? entries = _entries;
    Debug.Assert(entries != null, "expected entries to be non-null");

    IEqualityComparer<TKey>? comparer = _comparer;
    uint hashCode = (uint)((comparer == null) ? key.GetHashCode() : comparer.GetHashCode(key));

    uint collisionCount = 0;
    ref int bucket = ref GetBucket(hashCode); // Bucket slot selected by the hash code
    int i = bucket - 1; // Value in _buckets is 1-based

    // The branches below differ only in which equality comparer they use (depending on TKey
    // and on whether a comparer was supplied); the search logic is the same, so only one
    // branch is shown in full here.
    if (comparer == null)
    {
        if (typeof(TKey).IsValueType)
        {
            // ValueType: Devirtualize with EqualityComparer<TKey>.Default intrinsic
            while (true)
            {
                // Should be a while loop https://github.com/dotnet/runtime/issues/9422
                // Test uint in if rather than loop condition to drop range check for following array access
                if ((uint)i >= (uint)entries.Length)
                {
                    // i is -1 here (empty bucket, or end of chain): no existing entry has this key.
                    break;
                }

                // Is an entry with the same key already stored?
                if (entries[i].hashCode == hashCode && EqualityComparer<TKey>.Default.Equals(entries[i].key, key))
                {
                    if (behavior == InsertionBehavior.OverwriteExisting)
                    {
                        // Overwriting is allowed: replace the value in place.
                        entries[i].value = value;
                        return true;
                    }

                    if (behavior == InsertionBehavior.ThrowOnExisting)
                    {
                        // Overwriting is not allowed: report the duplicate key.
                        ThrowHelper.ThrowAddingDuplicateWithKeyArgumentException(key);
                    }

                    return false;
                }

                i = entries[i].next; // Move on to the next entry chained in the same bucket

                collisionCount++; // Count how many chain entries have been visited so far
                if (collisionCount > (uint)entries.Length)
                {
                    // The chain of entries forms a loop; which means a concurrent update has happened.
                    // Break out of the loop and throw, rather than looping forever.
                    // (More entries visited than exist; this also shows Dictionary is not thread-safe.)
                    ThrowHelper.ThrowInvalidOperationException_ConcurrentOperationsNotSupported();
                }
            }
        }
        else
        {
            // ... code omitted in the article; see the official source linked at the bottom of the page
        }
    }
    else
    {
        // ... code omitted in the article; see the official source linked at the bottom of the page
    }

    int index;
    // _freeCount can only be positive after a Remove; reuse a freed slot in that case.
    if (_freeCount > 0)
    {
        index = _freeList; // Most recently freed slot
        Debug.Assert((StartOfFreeList - entries[_freeList].next) >= -1, "shouldn't overflow because `next` cannot underflow");
        // Remove stored (StartOfFreeList - previous _freeList) in this slot's next; applying the
        // same subtraction again decodes the previous free-list head (StartOfFreeList is -3 per
        // the article). With no earlier free slot (_freeList was -1) the stored value is -2, which
        // decodes back to -1. The offset keeps the next of every free slot below -1, which is
        // how Resize tells free slots apart from live ones (next >= -1).
        _freeList = StartOfFreeList - entries[_freeList].next;
        _freeCount--;
    }
    else
    {
        // No free slot to reuse: append, growing _entries and _buckets first if they are full.
        int count = _count;
        if (count == entries.Length)
        {
            Resize();
            bucket = ref GetBucket(hashCode); // The bucket array was replaced; re-resolve the slot
        }
        index = count;
        _count = count + 1;
        entries = _entries;
    }

    ref Entry entry = ref entries![index];
    entry.hashCode = hashCode;
    entry.next = bucket - 1; // Value in _buckets is 1-based // [1]
    entry.key = key;
    entry.value = value;
    bucket = index + 1; // Value in _buckets is 1-based // [2]
    // Statements [1] and [2] are the heart of the chaining scheme:
    // - first entry for this bucket: bucket was 0, so entry.next = bucket - 1 = -1 (end of chain),
    //   and bucket becomes this entry's index + 1;
    // - every later entry landing in the same bucket points its next at the previous chain head,
    //   and bucket is updated to the new entry's index + 1 (the new entry becomes the head).
    _version++;
    // Per the article's reading of the rest of the class (not shown here): _version is bumped
    // when an element is added, when EnsureCapacity(int) actually grows the table, and when
    // TrimExcess()/TrimExcess(int) actually changes the capacity - unlike the .NET Framework
    // Dictionary, which bumps version on every add, update and remove.

    // For reference-type keys, when the number of collisions seen for this key exceeds
    // HashHelpers.HashCollisionThreshold and the comparer is a NonRandomizedStringEqualityComparer,
    // the table is rebuilt at the same size with forceNewHashCodes = true.
    // Value types never rehash
    if (!typeof(TKey).IsValueType && collisionCount > HashHelpers.HashCollisionThreshold && comparer is NonRandomizedStringEqualityComparer)
    {
        // If we hit the collision threshold we'll need to switch to the comparer which is using randomized string hashing
        // i.e. EqualityComparer<string>.Default.
        Resize(entries.Length, true);
    }

    return true;
}

Resize

Resize()会在字典,也就是entries容量不够时执行,对字典进行扩展。

// Grows the table to the size chosen by HashHelpers.ExpandPrime for the current count,
// without forcing new hash codes. Per the article, ExpandPrime(count) calls
// GetPrime(2 * oldSize), i.e. picks the smallest prime above twice the current element count.
private void Resize()
{
    int newSize = HashHelpers.ExpandPrime(_count);
    Resize(newSize, false);
}

/// <summary>
/// Rebuilds the table with <paramref name="newSize"/> slots: copies the existing entries into
/// a new array, optionally recomputes their hash codes, and relinks every live entry into a
/// fresh bucket array.
/// </summary>
/// <param name="newSize">Length of the new entries and buckets arrays; asserted to be at least the current entries length.</param>
/// <param name="forceNewHashCodes">When true (reference-type keys only), replaces the comparer with its randomized counterpart and rehashes every live key.</param>
private void Resize(int newSize, bool forceNewHashCodes)
{
    // Value types never rehash
    Debug.Assert(!forceNewHashCodes || !typeof(TKey).IsValueType);
    Debug.Assert(_entries != null, "_entries should be non-null");
    Debug.Assert(newSize >= _entries.Length);

    Entry[] entries = new Entry[newSize];

    int count = _count;
    Array.Copy(_entries, entries, count);	// Copy the existing entries into the new array

    // For reference-type keys with forceNewHashCodes set, the NonRandomizedStringEqualityComparer
    // is replaced by its randomized counterpart. The stored hash codes were produced by the old
    // comparer, so they must be recomputed with the new one to stay consistent with later lookups.
    if (!typeof(TKey).IsValueType && forceNewHashCodes)
    {
        Debug.Assert(_comparer is NonRandomizedStringEqualityComparer);
        _comparer = (IEqualityComparer<TKey>)((NonRandomizedStringEqualityComparer)_comparer).GetRandomizedEqualityComparer();

        for (int i = 0; i < count; i++)
        {
            // next >= -1 means the slot is live; freed slots carry a value below -1
            if (entries[i].next >= -1)
            {
                entries[i].hashCode = (uint)_comparer.GetHashCode(entries[i].key);
            }
        }

        // The default comparer is represented by a null _comparer field
        if (ReferenceEquals(_comparer, EqualityComparer<TKey>.Default))
        {
            _comparer = null;
        }
    }

    // Assign member variables after both arrays allocated to guard against corruption from OOM if second fails
    _buckets = new int[newSize];
    #if TARGET_64BIT
        _fastModMultiplier = HashHelpers.GetFastModMultiplier((uint)newSize);
    #endif
        // Re-bucket every entry against the new table size and rebuild the chains
        for (int i = 0; i < count; i++)
        {
            if (entries[i].next >= -1)
            {	// Skip slots freed by Remove
                ref int bucket = ref GetBucket(entries[i].hashCode);
                entries[i].next = bucket - 1; // Value in _buckets is 1-based
                bucket = i + 1;
            }
        }

    _entries = entries;
}

Remove

/// <summary>
/// Removes the entry for <paramref name="key"/>, if present, and hands back its value.
/// </summary>
/// <param name="key">Key to remove; a null key is rejected via ThrowHelper.</param>
/// <param name="value">The removed entry's value, or default when the key was not found.</param>
/// <returns>true when an entry was found and removed; otherwise false.</returns>
public bool Remove(TKey key, [MaybeNullWhen(false)] out TValue value)
{
    // This overload is a copy of the overload Remove(TKey key) with one additional
    // statement to copy the value for entry being removed into the output parameter.
    // Code has been intentionally duplicated for performance reasons.

    if (key == null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
    }

    if (_buckets != null)
    {
        Debug.Assert(_entries != null, "entries should be non-null");
        uint collisionCount = 0;
        uint hashCode = (uint)(_comparer?.GetHashCode(key) ?? key.GetHashCode());
        ref int bucket = ref GetBucket(hashCode);
        Entry[]? entries = _entries;
        int last = -1;	// Index of the previously visited entry in the chain (-1 = none yet)
        int i = bucket - 1; // Value in buckets is 1-based
        while (i >= 0)	// The loop ends when i drops below 0 (-1 = empty bucket or end of chain): the key does not exist
        {
            ref Entry entry = ref entries[i];

            if (entry.hashCode == hashCode && (_comparer?.Equals(entry.key, key) ?? EqualityComparer<TKey>.Default.Equals(entry.key, key)))
            {
                if (last < 0)
                {	// last < 0: the match is the head of its chain, so the bucket itself must be repointed
                    bucket = entry.next + 1; // Value in buckets is 1-based
                    // This is 0 (empty bucket) only when the removed entry had no successor
                }
                else
                {
                    entries[last].next = entry.next;	// Unlink: connect the predecessor to the successor
                }

                value = entry.value;

                Debug.Assert((StartOfFreeList - _freeList) < 0, "shouldn't underflow because max hashtable length is MaxPrimeArrayLength = 0x7FEFFFFD(2146435069) _freelist underflow threshold 2147483646");
                entry.next = StartOfFreeList - _freeList;	// Encode the previous free-list head into next; with no earlier free slot (_freeList == -1) and StartOfFreeList = -3 (per the article) this is -2

                if (RuntimeHelpers.IsReferenceOrContainsReferences<TKey>())
                {	// Only when TKey is, or contains, a reference: reset the slot to default ("!" is the null-forgiving operator, which only suppresses the nullable warning)
                    entry.key = default!;
                }

                if (RuntimeHelpers.IsReferenceOrContainsReferences<TValue>())
                {
                    entry.value = default!;
                }

                _freeList = i;	// The slot just freed becomes the head of the free list
                _freeCount++;	// One more free slot in entries
                return true;
            }

            last = i;	// Remember this entry as the predecessor for the next iteration
            i = entry.next;

            collisionCount++;
            if (collisionCount > (uint)entries.Length)
            {
                // The chain of entries forms a loop; which means a concurrent update has happened.
                // Break out of the loop and throw, rather than looping forever.
                // In other words: collisionCount > (uint)entries.Length means more entries were visited
                // than exist, so the chain is cyclic. This also shows that Dictionary is not thread-safe.
                ThrowHelper.ThrowInvalidOperationException_ConcurrentOperationsNotSupported();
            }
        }
    }

    value = default;
    return false;
}

Clear

为了提高效率,Clear函数没有对_entries和_buckets置null,只是把相关的变量重新初始化了。也符合逻辑,如果置null,那我为什么不直接new一个新的Dictionary呢。

/// <summary>
/// Empties the dictionary while keeping the already-allocated bucket and entry arrays:
/// the arrays are zeroed and the bookkeeping fields are reset.
/// </summary>
public void Clear()
{
    int usedSlots = _count;
    if (usedSlots <= 0)
    {
        // Nothing was ever stored; there is nothing to reset.
        return;
    }

    Debug.Assert(_buckets != null, "_buckets should be non-null");
    Debug.Assert(_entries != null, "_entries should be non-null");

    // Zero every bucket so no chain head remains.
    Array.Clear(_buckets, 0, _buckets.Length);

    // Reset the counters and the free list to their empty state.
    _count = 0;
    _freeList = -1;
    _freeCount = 0;

    // Only the slots that were in use need wiping.
    Array.Clear(_entries, 0, usedSlots);
}

官方源码:Dictionary.cs

posted @ 2021-01-11 21:03  Dirt·in·firework  阅读(482)  评论(0)    收藏  举报