LevelDB的block(block是sstable文件的存储单元)
LevelDB的block(block是sstable文件的存储单元)
block的存储结构:
# entry是表示一个key-value的条目
# restarts是每一轮前缀压缩时在block中的偏移量offset
# num_of_restarts是重新进行前缀压缩的次数
# trailer是block的最后5个字节,其中一个字节表示是否对block中的数据压缩,剩下4个字节表示crc校验码,trailer的5个字节不包含在block的size中
|entry_0|entry_1|...|entry_n|restarts[sizeof(uint32)*num_of_restarts]|num_of_restarts(uint32)|trailer|
# 具体每个条目的组成
# shared_bytes当前key与前一个key前缀相同的字节数
# unshared_bytes当前key自己独有的字节数
# value_bytes值的字节数
# unshared_key_data当前key独有的数据
# value_data 当前key的值
|shared_bytes|unshared_bytes|value_bytes|unshared_key_data|value_data|
# 最后5个字节的trailer
# type表示是否对block中的数据进行压缩
# crc32位校验码
|type|crc|
代码文件:table/block.h,table/block.cc
namespace leveldb {
struct BlockContents;
class Comparator;
// Holds the bytes of a single block and hands out iterators over the
// key/value entries stored in it.
class Block {
 public:
  // Initialize the block with the specified contents.
  // The bytes and their length are taken from contents.data; ownership of
  // the buffer follows contents.heap_allocated.
  explicit Block(const BlockContents& contents);
  // Copying is disabled.
  Block(const Block&) = delete;
  Block& operator=(const Block&) = delete;
  // Frees data_ only when this block owns it (see owned_).
  ~Block();
  // Size of the block contents in bytes. The constructor resets this to 0
  // as an error marker when the contents are malformed.
  size_t size() const { return size_; }
  // Returns an iterator for traversing the entries stored in this block,
  // using "comparator" to order keys.
  Iterator* NewIterator(const Comparator* comparator);
 private:
  class Iter;
  // Returns the number of restart points (how many times prefix compression
  // was restarted), read from the last uint32 of the block.
  uint32_t NumRestarts() const;
  const char* data_;  // Start of the block contents
  size_t size_;       // Length of data_ in bytes; 0 marks a malformed block
  // Offset from data_ at which the restart array begins.
  uint32_t restart_offset_;  // Offset in data_ of restart array
  bool owned_;               // Block owns data_[]
};
}  // namespace leveldb
namespace leveldb {
// The restart count is stored as a fixed32 in the final sizeof(uint32_t)
// bytes of the block, so step back that far from the end of the contents
// and decode it.
inline uint32_t Block::NumRestarts() const {
  assert(size_ >= sizeof(uint32_t));
  const char* const block_end = data_ + size_;
  return DecodeFixed32(block_end - sizeof(uint32_t));
}
// Wraps "contents" as a block and locates the restart array inside it.
//
// The block ends with a fixed32 restart count preceded by that many fixed32
// restart offsets. If the contents are too short to hold the count, or the
// count claims more restart slots than could fit, size_ is set to 0 as an
// error marker (NewIterator() then reports corruption).
//
// restart_offset_ is zero-initialized so that it never holds an
// indeterminate value on the error paths.
Block::Block(const BlockContents& contents)
    : data_(contents.data.data()),
      size_(contents.data.size()),
      restart_offset_(0),
      owned_(contents.heap_allocated) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
    return;
  }

  // Upper bound on the number of restart offsets that fit in the bytes
  // remaining after the trailing restart-count field.
  const size_t max_restarts_allowed =
      (size_ - sizeof(uint32_t)) / sizeof(uint32_t);
  const uint32_t num_restarts = NumRestarts();
  if (num_restarts > max_restarts_allowed) {
    // The size is too small for NumRestarts()
    size_ = 0;
    return;
  }

  // The restart array sits right before the count field: skip back over
  // the count itself plus num_restarts offsets.
  restart_offset_ = static_cast<uint32_t>(
      size_ - (1 + num_restarts) * sizeof(uint32_t));
}
// Releases the underlying buffer, but only when this block owns it.
Block::~Block() {
  if (!owned_) {
    return;  // Buffer belongs to someone else; nothing to free.
  }
  delete[] data_;
}
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively.  Will not dereference past "limit".
//
// An entry begins with the three lengths (shared, non_shared, value),
// followed by the non-shared key bytes and then the value bytes.
//
// If any errors are detected, returns nullptr.  Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared, uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return nullptr;
  *shared = reinterpret_cast<const uint8_t*>(p)[0];
  *non_shared = reinterpret_cast<const uint8_t*>(p)[1];
  *value_length = reinterpret_cast<const uint8_t*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
  }

  // Make sure the key delta and the value both fit before "limit".  The sum
  // is formed in 64 bits: in 32 bits two large lengths from a corrupt entry
  // could wrap around to a small number and slip past this check.
  const uint64_t remaining = static_cast<uint64_t>(limit - p);
  const uint64_t needed =
      static_cast<uint64_t>(*non_shared) + static_cast<uint64_t>(*value_length);
  if (remaining < needed) {
    return nullptr;
  }
  return p;
}
// Iterator over the entries of one block. Keys are prefix-compressed
// against the previous key, so the iterator rebuilds the full key in key_
// as it walks forward, and uses the restart array to jump into the block.
class Block::Iter : public Iterator {
 private:
  const Comparator* const comparator_;
  const char* const data_;       // underlying block contents; entries start here
  uint32_t const restarts_;      // Offset of restart array (list of fixed32)
  uint32_t const num_restarts_;  // Number of uint32_t entries in restart array
  // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
  uint32_t current_;
  uint32_t restart_index_;  // Index of restart block in which current_ falls
  std::string key_;  // Full key of the current entry
  Slice value_;      // Value of the current entry (points into data_)
  Status status_;    // Set to Corruption by CorruptionError()
  // Compare two keys using the comparator supplied at construction.
  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
  }
  // Return the offset in data_ just past the end of the current entry.
  // That is the end of value_, which is where the next entry begins.
  inline uint32_t NextEntryOffset() const {
    return (value_.data() + value_.size()) - data_;
  }
  // Return the offset (relative to data_) stored in slot "index" of the
  // restart array.
  uint32_t GetRestartPoint(uint32_t index) {
    assert(index < num_restarts_);
    return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
  }
  // Make restart point "index" the current one, so that the next call to
  // ParseNextKey() decodes the first entry at that restart point.
  void SeekToRestartPoint(uint32_t index) {
    key_.clear();
    restart_index_ = index;
    // current_ will be fixed by ParseNextKey();
    // ParseNextKey() starts at the end of value_, so set value_ accordingly
    uint32_t offset = GetRestartPoint(index);
    value_ = Slice(data_ + offset, 0);
  }
 public:
  // Construct an iterator over the block at "data" whose restart array
  // starts at offset "restarts" and holds "num_restarts" entries. The
  // iterator starts out invalid (current_ == restarts_).
  Iter(const Comparator* comparator, const char* data, uint32_t restarts,
       uint32_t num_restarts)
      : comparator_(comparator),
        data_(data),
        restarts_(restarts),
        num_restarts_(num_restarts),
        current_(restarts_),
        restart_index_(num_restarts_) {
    assert(num_restarts_ > 0);
  }
  // current_ is an entry offset, so while the iterator is positioned on an
  // entry it is smaller than the offset of the restart array.
  bool Valid() const override { return current_ < restarts_; }
  Status status() const override { return status_; }
  // Full key of the current entry. Requires Valid().
  Slice key() const override {
    assert(Valid());
    return key_;
  }
  // Value of the current entry. Requires Valid().
  Slice value() const override {
    assert(Valid());
    return value_;
  }
  // Advance to the next entry. Requires Valid().
  void Next() override {
    assert(Valid());
    ParseNextKey();
  }
  // Move to the entry preceding the current one, or become invalid if the
  // current entry is the first in the block. Requires Valid().
  void Prev() override {
    assert(Valid());
    // Scan backwards to a restart point before current_
    const uint32_t original = current_;
    while (GetRestartPoint(restart_index_) >= original) {
      if (restart_index_ == 0) {
        // No more entries
        current_ = restarts_;
        restart_index_ = num_restarts_;
        return;
      }
      restart_index_--;
    }
    // restart_index_ now names a restart point that starts before the
    // original entry; reposition there and walk forward.
    SeekToRestartPoint(restart_index_);
    do {
      // Loop until end of current entry hits the start of original entry
    } while (ParseNextKey() && NextEntryOffset() < original);
    // The loop stops on the entry that ends where the original entry began
    // (or on a parse failure).
  }
  // Position at the first entry whose key is >= target; the iterator
  // becomes invalid if no such entry is found.
  void Seek(const Slice& target) override {
    // Binary search in restart array to find the last restart point
    // with a key < target
    uint32_t left = 0;
    uint32_t right = num_restarts_ - 1;
    int current_key_compare = 0;
    if (Valid()) {
      // If we're already scanning, use the current position as a starting
      // point. This is beneficial if the key we're seeking to is ahead of the
      // current position.
      current_key_compare = Compare(key_, target);
      if (current_key_compare < 0) {
        // key_ is smaller than target
        left = restart_index_;
      } else if (current_key_compare > 0) {
        right = restart_index_;
      } else {
        // We're seeking to the key we're already at.
        return;
      }
    }
    // Binary search over the restart points.
    while (left < right) {
      uint32_t mid = (left + right + 1) / 2;
      uint32_t region_offset = GetRestartPoint(mid);
      uint32_t shared, non_shared, value_length;
      const char* key_ptr =
          DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
                      &non_shared, &value_length);
      // The entry at a restart point must carry its whole key (shared == 0).
      if (key_ptr == nullptr || (shared != 0)) {
        CorruptionError();
        return;
      }
      Slice mid_key(key_ptr, non_shared);
      if (Compare(mid_key, target) < 0) {
        // Key at "mid" is smaller than "target".  Therefore all
        // blocks before "mid" are uninteresting.
        left = mid;
      } else {
        // Key at "mid" is >= "target".  Therefore all blocks at or
        // after "mid" are uninteresting.
        right = mid - 1;
      }
    }
    // We might be able to use our current position within the restart block.
    // This is true if we determined the key we desire is in the current block
    // and is after than the current key.
    assert(current_key_compare == 0 || Valid());
    bool skip_seek = left == restart_index_ && current_key_compare < 0;
    if (!skip_seek) {
      SeekToRestartPoint(left);
    }
    // Linear search (within restart block) for first key >= target
    // Entries following the chosen restart point are examined one by one.
    while (true) {
      if (!ParseNextKey()) {
        return;
      }
      if (Compare(key_, target) >= 0) {
        return;
      }
    }
  }
  // Position at the first entry of the block.
  void SeekToFirst() override {
    SeekToRestartPoint(0);
    ParseNextKey();
  }
  // Position at the last entry of the block: start from the last restart
  // point and parse forward until the current entry ends at the restart
  // array.
  void SeekToLast() override {
    SeekToRestartPoint(num_restarts_ - 1);
    while (ParseNextKey() && NextEntryOffset() < restarts_) {
      // Keep skipping
    }
  }
 private:
  // Put the iterator into the invalid state, record a Corruption status and
  // drop the current key and value.
  void CorruptionError() {
    current_ = restarts_;
    restart_index_ = num_restarts_;
    status_ = Status::Corruption("bad entry in block");
    key_.clear();
    value_.clear();
  }
  // Decode the entry that follows the current one into key_/value_.
  // Returns true on success; returns false (leaving the iterator invalid)
  // when there are no more entries or the entry is corrupt.
  bool ParseNextKey() {
    // current_ becomes the offset of the next entry; it is an entry offset,
    // not a restart point offset.
    current_ = NextEntryOffset();
    const char* p = data_ + current_;
    // The entry area ends where the restart array begins.
    const char* limit = data_ + restarts_;  // Restarts come right after data
    if (p >= limit) {
      // No more entries to return.  Mark as invalid.
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return false;
    }
    // Decode next entry
    uint32_t shared, non_shared, value_length;
    p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
    // An entry cannot share more bytes than the previous key contains.
    if (p == nullptr || key_.size() < shared) {
      CorruptionError();
      return false;
    } else {
      // Keep the shared prefix of the previous key, then append this
      // entry's own key bytes; the value follows them directly.
      key_.resize(shared);
      key_.append(p, non_shared);
      value_ = Slice(p + non_shared, value_length);
      // Advance restart_index_ while the following restart point still lies
      // before the current entry.
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
      return true;
    }
  }
};
// Creates an iterator over this block's entries.
//  - Contents too short to hold the restart count (including the size_ == 0
//    error marker set by the constructor): an error iterator carrying a
//    Corruption status.
//  - No restart points: an empty iterator.
//  - Otherwise: a Block::Iter over data_ using the restart array.
Iterator* Block::NewIterator(const Comparator* comparator) {
  const bool too_short = size_ < sizeof(uint32_t);
  if (too_short) {
    return NewErrorIterator(Status::Corruption("bad block contents"));
  }

  const uint32_t restart_count = NumRestarts();
  if (restart_count > 0) {
    return new Iter(comparator, data_, restart_offset_, restart_count);
  }
  return NewEmptyIterator();
}
}  // namespace leveldb
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号