LevelDB的block(block是sstable文件的存储单元)
LevelDB的block(block是sstable文件的存储单元)
block的存储结构:
# entry是表示一个key-value的条目
# restarts是每一轮前缀压缩时在block中的偏移量offset
# num_of_restarts是重新进行前缀压缩的次数
# trailer是block的最后5个字节,其中一个字节表示是否对block中的数据压缩,剩下4个字节表示crc校验码,trailer的5个字节不包含在block的size中
|entry_0|entry_1|...|entry_n|restarts[sizeof(uint32)*num_of_restarts]|num_of_restarts(uint32)|trailer|
# 具体每个条目的组成
# shared_bytes当前key与前一个key前缀相同的字节数
# unshared_bytes当前key自己独有的字节数
# value_bytes值的字节数
# unshared_key_data当前key独有的数据
# value_data 当前key的值
|shared_bytes|unshared_bytes|value_bytes|unshared_key_data|value_data|
# 最后5个字节的trailer
# type表示是否对block中的数据进行压缩
# crc是32位(4个字节)的校验码
|type|crc|
代码文件:table/block.h,table/block.cc
namespace leveldb {
struct BlockContents;
class Comparator;
// In-memory representation of one sstable block: a run of prefix-compressed
// key/value entries followed by a restart array and a fixed32 restart count.
class Block {
 public:
  // A Block cannot be copied or copy-assigned.
  Block(const Block&) = delete;
  Block& operator=(const Block&) = delete;

  // Initialize the block with the specified contents.
  explicit Block(const BlockContents& contents);

  ~Block();

  // Returns the stored size_ value (the constructor resets it to 0 to mark a
  // malformed block).
  size_t size() const { return size_; }

  // Returns an iterator that walks the entries stored in this block.
  Iterator* NewIterator(const Comparator* comparator);

 private:
  class Iter;

  // Returns the number of restart points recorded in the block.
  uint32_t NumRestarts() const;

  // NOTE: the data members below must stay in this order; the constructor's
  // initializer list follows it.
  const char* data_;
  size_t size_;
  uint32_t restart_offset_;  // Offset in data_ of restart array
  bool owned_;               // Block owns data_[]
};
} // namespace leveldb
namespace leveldb {
// Returns the restart count, which is stored as a fixed32 in the last
// sizeof(uint32_t) bytes of the block; hence the subtraction from the end.
inline uint32_t Block::NumRestarts() const {
  assert(size_ >= sizeof(uint32_t));
  const char* const count_field = data_ + size_ - sizeof(uint32_t);
  return DecodeFixed32(count_field);
}
// Builds a Block over `contents` and locates the restart array.
//
// On success restart_offset_ is the offset in data_ at which the restart
// array begins. If the data is too short to hold the restart count, or the
// recorded count cannot fit in the block, size_ is set to 0 as an error
// marker and restart_offset_ stays 0 (it is never left uninitialized).
Block::Block(const BlockContents& contents)
    : data_(contents.data.data()),
      size_(contents.data.size()),
      restart_offset_(0),
      owned_(contents.heap_allocated) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
    return;
  }

  // Decode the count once, before size_ can be modified below.
  const uint32_t num_restarts = NumRestarts();

  // Upper bound on the restart array length: everything except the trailing
  // fixed32 count, divided into uint32_t slots.
  const size_t max_restarts_allowed =
      (size_ - sizeof(uint32_t)) / sizeof(uint32_t);
  if (num_restarts > max_restarts_allowed) {
    // The size is too small for NumRestarts()
    size_ = 0;
    return;
  }

  // The restart array plus the count occupy the last (1 + num_restarts)
  // uint32_t slots. Do the arithmetic in size_t so "1 + num_restarts" cannot
  // wrap in 32 bits.
  const size_t tail_bytes =
      (static_cast<size_t>(num_restarts) + 1) * sizeof(uint32_t);
  restart_offset_ = static_cast<uint32_t>(size_ - tail_bytes);
}
// Frees data_ only when this block owns the buffer.
Block::~Block() {
  if (!owned_) {
    return;
  }
  delete[] data_;
}
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively.  Will not dereference past "limit".
//
// An entry begins with the three lengths (shared, non_shared, value_length);
// this function parses them and leaves the returned pointer at the start of
// the non-shared key bytes that follow.
//
// If any errors are detected, returns nullptr.  Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared, uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return nullptr;
  *shared = reinterpret_cast<const uint8_t*>(p)[0];
  *non_shared = reinterpret_cast<const uint8_t*>(p)[1];
  *value_length = reinterpret_cast<const uint8_t*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
  }

  // The key delta and the value must both fit before "limit". Add the two
  // lengths in 64 bits: a 32-bit sum can wrap on corrupt input (for example
  // 0xFFFFFFFF + 2 == 1), which would let an oversized entry pass this check.
  const uint64_t payload_bytes =
      static_cast<uint64_t>(*non_shared) + static_cast<uint64_t>(*value_length);
  if (static_cast<uint64_t>(limit - p) < payload_bytes) {
    return nullptr;
  }
  return p;
}
// Iterator over the entries of a single block.
//
// The iterator keeps the offset of the current entry (current_), the index of
// the restart point whose region contains it (restart_index_), the fully
// reconstructed current key (key_) and a Slice over the current value
// (value_). It is invalid whenever current_ >= restarts_.
class Block::Iter : public Iterator {
private:
const Comparator* const comparator_;
const char* const data_; // underlying block contents; entry data starts here
uint32_t const restarts_; // Offset of restart array (list of fixed32)
uint32_t const num_restarts_; // Number of uint32_t entries in restart array
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_; // Offset of the entry the iterator is positioned on
uint32_t restart_index_; // Index of restart block in which current_ falls
// Full key of the current entry; ParseNextKey() rebuilds it from the shared
// prefix of the previous key plus the non-shared bytes of the entry.
std::string key_;
// Value of the current entry. Its end also marks where the next entry starts
// (see NextEntryOffset()).
Slice value_;
// Set to a Corruption status by CorruptionError().
Status status_;
// Compares two keys with the comparator supplied at construction.
inline int Compare(const Slice& a, const Slice& b) const {
return comparator_->Compare(a, b);
}
// Return the offset in data_ just past the end of the current entry.
// This is the offset at which the next entry begins.
inline uint32_t NextEntryOffset() const {
return (value_.data() + value_.size()) - data_;
}
// Returns the entry offset stored in slot "index" of the restart array.
uint32_t GetRestartPoint(uint32_t index) {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
// Positions the iterator just before the entry at restart point "index";
// the following ParseNextKey() call decodes that entry.
void SeekToRestartPoint(uint32_t index) {
key_.clear();
restart_index_ = index;
// current_ will be fixed by ParseNextKey();
// ParseNextKey() starts at the end of value_, so set value_ accordingly
uint32_t offset = GetRestartPoint(index);
value_ = Slice(data_ + offset, 0);
}
public:
// Creates an iterator over "data" that starts out invalid
// (current_ == restarts_, restart_index_ == num_restarts_).
// Requires num_restarts > 0.
Iter(const Comparator* comparator, const char* data, uint32_t restarts,
uint32_t num_restarts)
: comparator_(comparator),
data_(data),
restarts_(restarts),
num_restarts_(num_restarts),
current_(restarts_),
restart_index_(num_restarts_) {
assert(num_restarts_ > 0);
}
// current_ is an entry offset, so while the iterator is positioned on an
// entry it is smaller than the offset of the restart array.
bool Valid() const override { return current_ < restarts_; }
Status status() const override { return status_; }
// Returns the current key. Requires Valid().
Slice key() const override {
assert(Valid());
return key_;
}
// Returns the current value. Requires Valid().
Slice value() const override {
assert(Valid());
return value_;
}
// Advances to the next entry. Requires Valid().
void Next() override {
assert(Valid());
ParseNextKey();
}
// Moves to the entry preceding the current one, or invalidates the iterator
// if the current entry is the first one. Requires Valid().
void Prev() override {
assert(Valid());
// Scan backwards to a restart point before current_
const uint32_t original = current_;
while (GetRestartPoint(restart_index_) >= original) {
if (restart_index_ == 0) {
// No more entries
current_ = restarts_;
restart_index_ = num_restarts_;
return;
}
restart_index_--;
}
// restart_index_ now names the closest restart point that lies strictly
// before the original entry.
SeekToRestartPoint(restart_index_);
do {
// Loop until end of current entry hits the start of original entry
} while (ParseNextKey() && NextEntryOffset() < original);
// When the loop stops on a successfully parsed entry, that entry is the one
// immediately before the original entry.
}
// Positions the iterator at the first entry whose key is >= target; the
// iterator ends up invalid if there is no such entry or if a corrupt entry
// is encountered.
void Seek(const Slice& target) override {
// Binary search in restart array to find the last restart point
// with a key < target
uint32_t left = 0;
uint32_t right = num_restarts_ - 1;
int current_key_compare = 0;
if (Valid()) {
// If we're already scanning, use the current position as a starting
// point. This is beneficial if the key we're seeking to is ahead of the
// current position.
current_key_compare = Compare(key_, target);
if (current_key_compare < 0) {
// key_ is smaller than target
left = restart_index_;
} else if (current_key_compare > 0) {
right = restart_index_;
} else {
// We're seeking to the key we're already at.
return;
}
}
// Binary search over the restart points. The entry at a restart point must
// have shared == 0, so its key can be read without reconstruction.
while (left < right) {
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr =
DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
&non_shared, &value_length);
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return;
}
Slice mid_key(key_ptr, non_shared);
if (Compare(mid_key, target) < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
}
}
// We might be able to use our current position within the restart block.
// This is true if we determined the key we desire is in the current block
// and is after than the current key.
assert(current_key_compare == 0 || Valid());
bool skip_seek = left == restart_index_ && current_key_compare < 0;
if (!skip_seek) {
SeekToRestartPoint(left);
}
// Linear search (within restart block) for first key >= target
// Entries are decoded one at a time starting from the chosen position.
while (true) {
if (!ParseNextKey()) {
return;
}
if (Compare(key_, target) >= 0) {
return;
}
}
}
// Positions the iterator at the first entry of the block.
void SeekToFirst() override {
SeekToRestartPoint(0);
ParseNextKey();
}
// Positions the iterator at the last entry of the block by scanning forward
// from the last restart point.
void SeekToLast() override {
SeekToRestartPoint(num_restarts_ - 1);
while (ParseNextKey() && NextEntryOffset() < restarts_) {
// Keep skipping
}
}
private:
// Marks the iterator invalid, records a Corruption status and clears the
// cached key and value.
void CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
key_.clear();
value_.clear();
}
// Decodes the entry that follows the current one and makes it current.
// Returns false, leaving the iterator invalid, when there are no more
// entries or the entry is corrupt.
bool ParseNextKey() {
// current_ becomes the offset of the next entry (an entry offset, not a
// restart-point offset).
current_ = NextEntryOffset();
const char* p = data_ + current_;
// Entries end where the restart array begins.
const char* limit = data_ + restarts_; // Restarts come right after data
if (p >= limit) {
// No more entries to return. Mark as invalid.
current_ = restarts_;
restart_index_ = num_restarts_;
return false;
}
// Decode next entry
uint32_t shared, non_shared, value_length;
p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
if (p == nullptr || key_.size() < shared) {
CorruptionError();
return false;
} else {
// Keep the first "shared" bytes of the previous key and append this
// entry's non-shared bytes to rebuild the full key.
key_.resize(shared);
key_.append(p, non_shared);
value_ = Slice(p + non_shared, value_length);
// Advance restart_index_ while the next restart point lies before current_.
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) {
++restart_index_;
}
return true;
}
}
};
// Creates an iterator over this block's entries.
//
// Returns an error iterator when size_ is too small to hold the restart count
// (the constructor sets size_ to 0 for malformed blocks), an empty iterator
// when the block records zero restart points, and a Block::Iter otherwise.
Iterator* Block::NewIterator(const Comparator* comparator) {
  const bool malformed = size_ < sizeof(uint32_t);
  if (malformed) {
    return NewErrorIterator(Status::Corruption("bad block contents"));
  }

  const uint32_t restart_count = NumRestarts();
  if (restart_count == 0) {
    return NewEmptyIterator();
  }
  return new Iter(comparator, data_, restart_offset_, restart_count);
}
} // namespace leveldb
浙公网安备 33010602011771号