【源码】浅看Dictionary(.Net Core)源码
注:本文参考的源码来自System.Private.CoreLib.dll,会有部分代码逻辑不同于.Net Framework的Dictionary。如需了解可参考【源码】浅看Dictionary(mscorlib)源码
.Net Core的 Dictionary(严格来说是.Net Core整个框架)的实现使用了大量的 ref(ref 局部变量和 ref 返回值),目的是直接引用数组中已有的元素,避免结构体的复制和重复的数组索引,进而提高性能
在自定义类重写Equals()和GetHashCode()的时候有些好奇,所以想看看Dictionary查找Key和存储键值对的原理。
下面通过看看几个最常用的Dictionary的方法,了解一下Dictionary的实现原理。
GetHashCode
首先看看如果在重写Equals()的时候不重写GetHashCode()会出现什么情况
/// <summary>
/// Demo key type whose equality is defined by <see cref="Name"/> and <see cref="Birth"/>.
/// GetHashCode is intentionally NOT overridden here: the example exists to show what
/// happens to dictionary lookups when only Equals is overridden.
/// </summary>
class Cat
{
    public string Name;
    public string Birth;

    /// <summary>Returns true when <paramref name="cat"/> is non-null and has the same Name and Birth.</summary>
    public bool Equals(Cat cat)
    {
        if (cat is null)
        {
            return false;
        }

        // Trace line so the demo shows whether this custom comparison was invoked at all.
        Console.WriteLine("自定义_Cat");

        bool sameName = Name == cat.Name;
        return sameName && Birth == cat.Birth;
    }

    /// <summary>Forwards to <see cref="Equals(Cat)"/>; anything that is not a Cat becomes null and compares unequal.</summary>
    public override bool Equals(object obj)
    {
        return Equals(obj as Cat);
    }
}
/// <summary>
/// Fills a dictionary with four Cat keys (the first and last carry the same Name/Birth),
/// then probes it with a freshly constructed Cat carrying that same data and prints
/// the ContainsKey result.
/// </summary>
static void Main(string[] args)
{
    var cats = new Dictionary<Cat, string>();
    cats.Add(new Cat { Name = "小黑", Birth = "2020-2-1" }, "小黑");
    cats.Add(new Cat { Name = "小黄", Birth = "2020-4-1" }, "小黄");
    cats.Add(new Cat { Name = "憨豆", Birth = "2020-5-1" }, "憨豆");
    cats.Add(new Cat { Name = "小黑", Birth = "2020-2-1" }, "小黑2");

    // A new instance with the same field values as the first key.
    Cat probe = new Cat { Name = "小黑", Birth = "2020-2-1" };
    bool found = cats.ContainsKey(probe);
    Console.WriteLine(found);
}
// 输出
// False
可以看到虽然有两个“一样”的Key,{ new Cat { Name="小黑", Birth = "2020-2-1" }, "小黑" }和{ new Cat { Name="小黑", Birth = "2020-2-1" }, "小黑2" },但是Dictionary还是正常初始化了,而在ContainsKey()时找不到Key。
ContainsKey
源码中的ContainsKey()直接 return !Unsafe.IsNullRef(ref FindValue(key));,再看看FindValue(TKey key)的实现
// Looks up `key` and returns a reference to the value stored for it, or a null
// reference (Unsafe.NullRef<TValue>()) when the key is not present.
// Throws via ThrowHelper when key is null, and when the chain walk takes more steps
// than there are entries (treated as evidence of a concurrent update).
// NOTE: this is an abridged excerpt; two of the three comparer branches are elided.
private ref TValue FindValue(TKey key)
{
if (key == null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
}
ref Entry entry = ref Unsafe.NullRef<Entry>();
if (_buckets != null)
{
Debug.Assert(_entries != null, "expected entries to be != null");
IEqualityComparer<TKey>? comparer = _comparer;
// The branches below differ only in which equality comparer does the comparison (depending on TKey and on whether a comparer is stored); per the article their lookup logic is otherwise the same, so only one branch is shown in full.
if (comparer == null)
{ // No comparer stored: hash with key.GetHashCode() and compare with EqualityComparer<TKey>.Default. (Per the article, the constructor leaves _comparer null when TKey is not string and no comparer, or the default one, was passed - the constructor is not shown here.)
uint hashCode = (uint)key.GetHashCode();
int i = GetBucket(hashCode); // Value stored in the bucket slot selected for this hash code (a 1-based index into entries)
Entry[]? entries = _entries;
uint collisionCount = 0;
if (typeof(TKey).IsValueType)
{ // TKey is a value type
// ValueType: Devirtualize with EqualityComparer<TValue>.Default intrinsic
i--; // Value in _buckets is 1-based; subtract 1 from i. We do it here so it fuses with the following conditional.
do
{
if ((uint)i >= (uint)entries.Length)
{ // i is -1 here when the bucket slot held 0 (per the article, Initialize allocates buckets with new int[size], so untouched slots are 0) or when the chain ended with next == -1; cast to uint that is >= entries.Length, so the key is not present
goto ReturnNotFound;
}
entry = ref entries[i];
if (entry.hashCode == hashCode && EqualityComparer<TKey>.Default.Equals(entry.key, key))
{ // A match requires BOTH an equal hash code and Equals returning true; Equals is not evaluated when the hash codes differ
goto ReturnFound;
}
i = entry.next; // Move to the next entry in the same bucket chain
collisionCount++; // Count how many chain links have been followed
} while (collisionCount <= (uint)entries.Length);
// The chain of entries forms a loop; which means a concurrent update has happened.
// Break out of the loop and throw, rather than looping forever.
// In other words: falling out of the do/while means more links were followed than there are entries, which is reported as a concurrent-operation error. Dictionary is not thread-safe.
goto ConcurrentOperation;
}
else // TKey is not a value type
{
// Object type: Shared Generic, EqualityComparer<TValue>.Default won't devirtualize
// https://github.com/dotnet/runtime/issues/10050
// So cache in a local rather than get EqualityComparer per loop iteration
EqualityComparer<TKey> defaultComparer = EqualityComparer<TKey>.Default;
// ... code omitted in the article; see the official source linked at the bottom of the page
}
}
else // A comparer is stored in _comparer (per the article: TKey is string, or a comparer was passed in)
{
uint hashCode = (uint)comparer.GetHashCode(key);
// ... code omitted in the article; see the official source linked at the bottom of the page
}
}
goto ReturnNotFound;
ConcurrentOperation:
ThrowHelper.ThrowInvalidOperationException_ConcurrentOperationsNotSupported();
ReturnFound:
ref TValue value = ref entry.value;
Return:
return ref value;
ReturnNotFound:
value = ref Unsafe.NullRef<TValue>();
goto Return;
}
if (entry.hashCode == hashCode && EqualityComparer<TKey>.Default.Equals(entry.key, key))
所以需要hashCode相同并且EqualityComparer<TKey>.Default.Equals(entry.key, key)返回true才认为两个Key一样。因为重写了Equals(),所以只要执行到Equals()这一步,返回的就是true;但在没有重写GetHashCode()时,默认使用Object.GetHashCode()获取hashCode,它返回的是运行时为每个对象实例单独分配的哈希值,与字段内容无关(也不保证与对象的存储地址有关),不同实例的哈希值几乎不可能相同。而我们是new了一个新对象去找Key的,得到的hashCode与字典中Key的hashCode不一致,比较在hashCode这一步就已经失败,根本不会调用到Equals()(所以也看不到"自定义_Cat"的输出),所以会出现这样的现象。
所以必须同时重写Equals()和GetHashCode(),比如在这个例子中,根据Name和Birth确定是否同一只Cat,
/// <summary>
/// Hash code derived from the same two fields that Equals compares (Name and Birth),
/// so two cats that compare equal also produce the same hash code.
/// </summary>
public override int GetHashCode()
{
    // System.HashCode.Combine accepts null fields (Equals already tolerates them, while
    // Name.GetHashCode() would throw) and mixes the values order-sensitively, whereas a
    // plain XOR yields 0 whenever Name == Birth and collides when the two are swapped.
    return System.HashCode.Combine(Name, Birth);
}
此时调用的是string.GetHashCode(),生成的哈希值只与string有关,只要string一样,得到的hashCode是一样的。
Dictionary的基本原理其实就是利用一个hash表对Key和Key_Value进行存储和寻址。要看懂源码先要有点哈希表相关的基础,没有数据结构基础的慢慢看也不难懂。
Add
先上图
左边的Dictionary有一个元素,entries[0].hashCode = 8,8 % buckets.Length = 1,因此,entries[0].next = buckets[1] - 1 = -1,buckets[1] = 0 + 1 = 1,实际指向entries[0]。
此时插入第二个元素,entries[1].hashCode = 15,15 % buckets.Length = 1,因此,entries[1].next = buckets[1] - 1 = 0,buckets[1] = 1 + 1 = 2,实际指向entries[1]。
带着图看源码
/// <summary>
/// Adds the key/value pair by delegating to TryInsert with
/// InsertionBehavior.ThrowOnExisting, so an already-present key raises an exception.
/// </summary>
public void Add(TKey key, TValue value)
{
    bool inserted = TryInsert(key, value, InsertionBehavior.ThrowOnExisting);

    // If there was an existing key and the Add failed, an exception will already have been thrown.
    Debug.Assert(inserted);
}
再看TryInsert()
这里有两个关键的变量_entries和_buckets
private Entry[]? _entries; // Entries stored by index; each one holds the key, the value, the key's hash code and the index of the next entry in its chain
private int[]? _buckets; // Hash table of entry indices stored 1-based (index + 1); GetBucket selects the slot for a hash code (per the article: hash code modulo buckets.Length, and the .NET Framework version stores the index without the +1)
// One slot of the entries array: a key/value pair plus the bookkeeping needed to
// chain entries that share a bucket.
private struct Entry
{
    // Hash code of the key. Declared uint because every use in this file treats it as
    // unsigned: it is assigned from (uint) casts and passed to GetBucket(uint).
    public uint hashCode;

    // Index of the next entry in the same bucket chain; -1 marks the end of the chain.
    // Values below -1 mark a slot on the free list: Remove stores
    // StartOfFreeList - _freeList here and Resize skips slots whose next is < -1.
    public int next;

    public TKey key;     // Key of entry
    public TValue value; // Value of entry
}
// Inserts key/value according to `behavior`.
// Returns true when the pair was stored (new entry, or existing value overwritten under
// OverwriteExisting) and false when the key already exists and `behavior` is neither
// OverwriteExisting nor ThrowOnExisting. Throws via ThrowHelper when key is null, when the
// key exists under ThrowOnExisting, or when the chain walk indicates a concurrent update.
// The return type is bool: the body returns true/false and Add stores the result in a bool.
// NOTE: abridged excerpt; two of the three comparer branches are elided.
private bool TryInsert(TKey key, TValue value, InsertionBehavior behavior)
{
    if (key == null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
    }

    if (_buckets == null) // Lazily allocate the tables on first insert
    {
        Initialize(0);
    }
    Debug.Assert(_buckets != null);

    Entry[]? entries = _entries;
    Debug.Assert(entries != null, "expected entries to be non-null");

    IEqualityComparer<TKey>? comparer = _comparer;
    uint hashCode = (uint)((comparer == null) ? key.GetHashCode() : comparer.GetHashCode(key));

    uint collisionCount = 0;
    ref int bucket = ref GetBucket(hashCode); // Bucket slot selected for the new key's hash code
    int i = bucket - 1; // Value in _buckets is 1-based

    // The branches below differ only in which equality comparer does the comparison
    // (depending on TKey and on whether a comparer is stored); per the article their logic
    // is otherwise the same, so only one branch is shown in full.
    if (comparer == null)
    {
        if (typeof(TKey).IsValueType)
        {
            // ValueType: Devirtualize with EqualityComparer<TValue>.Default intrinsic
            while (true)
            {
                // Should be a while loop https://github.com/dotnet/runtime/issues/9422
                // Test uint in if rather than loop condition to drop range check for following array access
                if ((uint)i >= (uint)entries.Length)
                {
                    // i is -1 here when the bucket slot held 0 (empty bucket) or when the
                    // chain ended with next == -1; cast to uint that is >= entries.Length,
                    // so no existing entry has this key.
                    break;
                }

                // Look for an existing entry with the same key
                if (entries[i].hashCode == hashCode && EqualityComparer<TKey>.Default.Equals(entries[i].key, key))
                {
                    if (behavior == InsertionBehavior.OverwriteExisting)
                    {
                        // Overwriting is allowed: replace the value in place
                        entries[i].value = value;
                        return true;
                    }

                    if (behavior == InsertionBehavior.ThrowOnExisting)
                    {
                        // Duplicates are not allowed: report the duplicate key
                        ThrowHelper.ThrowAddingDuplicateWithKeyArgumentException(key);
                    }

                    return false;
                }

                i = entries[i].next; // Move to the next entry in the same bucket chain

                collisionCount++; // Count how many chain links have been followed
                if (collisionCount > (uint)entries.Length)
                {
                    // The chain of entries forms a loop; which means a concurrent update has happened.
                    // Break out of the loop and throw, rather than looping forever.
                    // Dictionary is not thread-safe.
                    ThrowHelper.ThrowInvalidOperationException_ConcurrentOperationsNotSupported();
                }
            }
        }
        else
        {
            // ... code omitted in the article; see the official source linked at the bottom of the page
        }
    }
    else
    {
        // ... code omitted in the article; see the official source linked at the bottom of the page
    }

    int index;
    // _freeCount is only incremented by Remove, so a positive value means there are
    // freed slots in entries that can be reused for the new entry.
    if (_freeCount > 0)
    {
        index = _freeList; // Reuse the most recently freed slot
        Debug.Assert((StartOfFreeList - entries[_freeList].next) >= -1, "shouldn't overflow because `next` cannot underflow");
        // Remove stored the previous free-list head in this slot's next, encoded as
        // StartOfFreeList - previousHead; applying the same subtraction decodes it. The
        // encoding keeps next below -1 for free slots, which is how Resize tells them
        // apart from live entries (next >= -1).
        _freeList = StartOfFreeList - entries[_freeList].next;
        _freeCount--; // One fewer free slot
    }
    else
    {
        // No free slot: append at index _count, growing both tables first when entries is full
        int count = _count;
        if (count == entries.Length)
        {
            Resize();
            bucket = ref GetBucket(hashCode); // _buckets was replaced by Resize, so re-resolve the slot
        }
        index = count;

        _count = count + 1;
        entries = _entries;
    }

    ref Entry entry = ref entries![index];
    entry.hashCode = hashCode;
    entry.next = bucket - 1; // Value in _buckets is 1-based // [1]
    entry.key = key;
    entry.value = value;
    bucket = index + 1; // Value in _buckets is 1-based // [2]
    // Lines [1] and [2] are the core of the chaining scheme:
    // when the bucket was empty (slot value 0), entry.next = 0 - 1 = -1, i.e. end of chain,
    // and the slot now stores this entry's index + 1;
    // on every later insert into the same bucket, next points at the previous chain head
    // and the slot is updated to the new entry, so the newest entry is always the head.
    _version++;
    // Per the article: _version is incremented when an element is added, on manual growth
    // (EnsureCapacity(int capacity) when it actually grows) and on TrimExcess() /
    // TrimExcess(int capacity) when the capacity is adjusted - unlike the .NET Framework
    // Dictionary, which increments version on every add, update and remove.
    // (Those other methods are not shown here.)

    // When TKey is not a value type, the number of collisions seen for this insert exceeds
    // HashHelpers.HashCollisionThreshold, and the comparer is a
    // NonRandomizedStringEqualityComparer, the table is rebuilt with forceNewHashCodes = true,
    // which switches to the randomized comparer (presumably to break up long collision chains).
    // Value types never rehash
    if (!typeof(TKey).IsValueType && collisionCount > HashHelpers.HashCollisionThreshold && comparer is NonRandomizedStringEqualityComparer)
    {
        // If we hit the collision threshold we'll need to switch to the comparer which is using randomized string hashing
        // i.e. EqualityComparer<string>.Default.
        Resize(entries.Length, true);
    }

    return true;
}
Resize
Resize()会在字典,也就是entries容量不够时执行,对字典进行扩展。
// Grows the table to the size returned by HashHelpers.ExpandPrime(_count), keeping the
// existing hash codes. Per the article, ExpandPrime calls GetPrime(2 * oldSize) to pick
// a prime of at least twice the old size (ExpandPrime itself is not shown here).
private void Resize()
{
    Resize(HashHelpers.ExpandPrime(_count), forceNewHashCodes: false);
}
// Rebuilds the table at `newSize`: copies the existing entries into a new array,
// optionally recomputes their hash codes with the randomized comparer
// (forceNewHashCodes), allocates a fresh bucket array and re-links every live entry
// into its new bucket chain.
private void Resize(int newSize, bool forceNewHashCodes)
{
// Value types never rehash
Debug.Assert(!forceNewHashCodes || !typeof(TKey).IsValueType);
Debug.Assert(_entries != null, "_entries should be non-null");
Debug.Assert(newSize >= _entries.Length);
Entry[] entries = new Entry[newSize];
int count = _count;
Array.Copy(_entries, entries, count); // Copy the first `count` existing entries into the new array
// When forceNewHashCodes is set (and TKey is not a value type), _comparer is replaced by the randomized comparer and every live entry's hash code is recomputed with it.
// The recomputation is required because the stored hash codes came from the old comparer: lookups will hash keys with the new one, and an entry only matches when its stored hash code equals the looked-up hash code.
if (!typeof(TKey).IsValueType && forceNewHashCodes)
{
Debug.Assert(_comparer is NonRandomizedStringEqualityComparer);
_comparer = (IEqualityComparer<TKey>)((NonRandomizedStringEqualityComparer)_comparer).GetRandomizedEqualityComparer();
for (int i = 0; i < count; i++)
{
if (entries[i].next >= -1)
{
entries[i].hashCode = (uint)_comparer.GetHashCode(entries[i].key);
}
}
if (ReferenceEquals(_comparer, EqualityComparer<TKey>.Default))
{
_comparer = null;
}
}
// Assign member variables after both arrays allocated to guard against corruption from OOM if second fails
_buckets = new int[newSize];
#if TARGET_64BIT
_fastModMultiplier = HashHelpers.GetFastModMultiplier((uint)newSize);
#endif
// Re-bucket every entry against the new table size and rebuild the chains
for (int i = 0; i < count; i++)
{
if (entries[i].next >= -1)
{ // Only live entries; slots freed by Remove carry a next value below -1 and are skipped
ref int bucket = ref GetBucket(entries[i].hashCode);
entries[i].next = bucket - 1; // Value in _buckets is 1-based
bucket = i + 1;
}
}
_entries = entries;
}
Remove
// Removes the entry for `key`. Returns true and copies the removed value into `value`
// when the key was present; otherwise returns false with `value` set to default.
// Throws via ThrowHelper when key is null or when the chain walk indicates a concurrent update.
public bool Remove(TKey key, [MaybeNullWhen(false)] out TValue value)
{
// This overload is a copy of the overload Remove(TKey key) with one additional
// statement to copy the value for entry being removed into the output parameter.
// Code has been intentionally duplicated for performance reasons.
if (key == null)
{
ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
}
if (_buckets != null)
{
Debug.Assert(_entries != null, "entries should be non-null");
uint collisionCount = 0;
uint hashCode = (uint)(_comparer?.GetHashCode(key) ?? key.GetHashCode());
ref int bucket = ref GetBucket(hashCode);
Entry[]? entries = _entries;
int last = -1; // Index of the previously visited entry in the chain; -1 while at the chain head
int i = bucket - 1; // Value in buckets is 1-based
while (i >= 0) // The loop ends when i becomes negative (-1: empty bucket or end of chain), meaning the key was not found
{
ref Entry entry = ref entries[i];
if (entry.hashCode == hashCode && (_comparer?.Equals(entry.key, key) ?? EqualityComparer<TKey>.Default.Equals(entry.key, key)))
{
if (last < 0)
{ // last < 0 means the match is the head of its bucket chain (not necessarily the only entry in it)
bucket = entry.next + 1; // Value in buckets is 1-based
// The bucket now points at the entry that followed the removed one; it becomes 0 (empty) only when entry.next was -1
}
else
{
entries[last].next = entry.next; // Unlink: connect the previous entry directly to the one after the removed entry
}
value = entry.value;
Debug.Assert((StartOfFreeList - _freeList) < 0, "shouldn't underflow because max hashtable length is MaxPrimeArrayLength = 0x7FEFFFFD(2147483646) _freelist underflow threshold 2147483646");
entry.next = StartOfFreeList - _freeList; // Store the previous free-list head in next, encoded as StartOfFreeList - _freeList (asserted negative above); TryInsert decodes it with the same subtraction. The article gives StartOfFreeList = -3, so an empty free list (_freeList == -1) is stored as -2
if (RuntimeHelpers.IsReferenceOrContainsReferences<TKey>())
{ // Judging by the helper's name: TKey is, or contains, a reference, so reset the slot to default so the removed key is no longer referenced. `!` is the null-forgiving operator and only suppresses the nullable warning
entry.key = default!;
}
if (RuntimeHelpers.IsReferenceOrContainsReferences<TValue>())
{
entry.value = default!;
}
_freeList = i; // The removed slot becomes the new head of the free list
_freeCount++; // One more free slot in entries
return true;
}
last = i; // Remember the entry just visited so it can be re-linked if the next one is removed
i = entry.next;
collisionCount++;
if (collisionCount > (uint)entries.Length)
{
// The chain of entries forms a loop; which means a concurrent update has happened.
// Break out of the loop and throw, rather than looping forever.
// In other words: more links were followed than there are entries, which is reported
// as a concurrent-operation error. Dictionary is not thread-safe.
ThrowHelper.ThrowInvalidOperationException_ConcurrentOperationsNotSupported();
}
}
}
value = default;
return false;
}
Clear
为了提高效率,Clear函数没有对_entries和_buckets置null,只是把相关的变量重新初始化了。也符合逻辑,如果置null,那我为什么不直接new一个新的Dictionary呢。
// Empties the dictionary while keeping both arrays allocated: zeroes every bucket,
// resets the count and free-list bookkeeping, and clears the used portion of the
// entries array. Does nothing when the dictionary holds no entries.
public void Clear()
{
    int used = _count;
    if (used <= 0)
    {
        return;
    }

    Debug.Assert(_buckets != null, "_buckets should be non-null");
    Debug.Assert(_entries != null, "_entries should be non-null");

    Array.Clear(_buckets, 0, _buckets.Length);

    _count = 0;
    _freeList = -1;
    _freeCount = 0;

    // Only the first `used` slots were ever written, so only those need clearing.
    Array.Clear(_entries, 0, used);
}
官方源码:Dictionary.cs

浙公网安备 33010602011771号