InnoDB 中的锁实现

InnoDB 中的锁实现

原帖:InnoDB 锁系统及死锁检测实现分析

InnoDB 中,所有事务加的行锁通过一个全局的 hash 表 lock_sys 维护:

/** The global lock system instance; initialized to NULL here. */
lock_sys_t *lock_sys = NULL;
/** The lock system struct. Holds the hash tables for record, predicate and
page locks, plus the bookkeeping for user threads suspended in a lock wait.
The two mutexes (mutex and wait_mutex) are separated by cache-line sized
padding arrays. */
struct lock_sys_t {
  char pad1[INNOBASE_CACHE_LINE_SIZE];
  /*!< padding to prevent other
  memory update hotspots from
  residing on the same memory
  cache line */
  LockMutex mutex;              /*!< Mutex protecting the
                                locks */
  hash_table_t *rec_hash;       /*!< hash table of the record
                                locks */
  hash_table_t *prdt_hash;      /*!< hash table of the predicate
                                lock */
  hash_table_t *prdt_page_hash; /*!< hash table of the page
                                lock */

  char pad2[INNOBASE_CACHE_LINE_SIZE]; /*!< Padding between the fields
                                       guarded by mutex and the fields
                                       guarded by wait_mutex */
  LockMutex wait_mutex;                /*!< Mutex protecting the
                                       next two fields */
  srv_slot_t *waiting_threads;         /*!< Array of user threads
                                       suspended while waiting for
                                       locks within InnoDB, protected
                                       by the lock_sys->wait_mutex */
  srv_slot_t *last_slot;               /*!< highest slot ever used
                                       in the waiting_threads array,
                                       protected by
                                       lock_sys->wait_mutex */
  int n_waiting;                       /*!< Number of slots in use.
                                       Protected by lock_sys->mutex.
                                       NOTE(review): unlike the two
                                       fields above, this is documented
                                       as guarded by mutex rather than
                                       wait_mutex - confirm */
  ibool rollback_complete;
  /*!< TRUE if rollback of all
  recovered transactions is
  complete. Protected by
  lock_sys->mutex */

  ulint n_lock_max_wait_time; /*!< Max wait time */

  os_event_t timeout_event; /*!< Set to the event that is
                            created in the lock wait monitor
                            thread. A value of 0 means the
                            thread is not active */

  /** Marker value before trx_t::age. */
  uint64_t mark_age_updated;

#ifdef UNIV_DEBUG
  /** Lock timestamp counter (debug builds only) */
  uint64_t m_seq;
#endif /* UNIV_DEBUG */
};

在进一步介绍行锁实现之前,先介绍一下InnoDB中事务与锁相关的数据结构

/** In-memory descriptor of a transaction. Bundles the isolation level
definitions, the transaction state, the lock information (trx_lock_t lock),
undo log bookkeeping, and the MySQL-layer handles that belong to one
transaction. The per-field comments state which mutex, if any, protects
each field. */
struct trx_t {
  /** Transaction isolation levels. */
  enum isolation_level_t {

    /** dirty read: non-locking SELECTs are performed so that we
    do not look at a possible earlier version of a record; thus
    they are not 'consistent' reads under this isolation level;
    otherwise like level 2 */
    READ_UNCOMMITTED,

    /** somewhat Oracle-like isolation, except that in range UPDATE
    and DELETE we must block phantom rows with next-key locks;
    SELECT ... FOR UPDATE and ...  LOCK IN SHARE MODE only lock
    the index records, NOT the gaps before them, and thus allow
    free inserting; each consistent read reads its own snapshot */
    READ_COMMITTED,

    /** this is the default; all consistent reads in the same trx
    read the same snapshot; full next-key locking used in locking
    reads to block insertions into gaps */
    REPEATABLE_READ,

    /** all plain SELECTs are converted to LOCK IN SHARE MODE
    reads */
    SERIALIZABLE
  };

  TrxMutex mutex; /*!< Mutex protecting the fields
                  state and lock (except some fields
                  of lock, which are protected by
                  lock_sys->mutex) */

  bool owns_mutex; /*!< Set to the transaction that owns
                   the mutex during lock acquire and/or
                   release.

                   This is used to avoid taking the
                   trx_t::mutex recursively. */

  /* Note: in_depth was split from in_innodb for fixing a RO
  performance issue. Acquiring the trx_t::mutex for each row
  costs ~3% in performance. It is not required for correctness.
  Therefore we increment/decrement in_depth without holding any
  mutex. The assumption is that the Server will only ever call
  the handler from one thread. This is not true for kill_connection.
  Therefore in innobase_kill_connection. We don't increment this
  counter via TrxInInnoDB. */

  ib_uint32_t in_depth; /*!< Track nested TrxInInnoDB
                        count */

  ib_uint32_t in_innodb; /*!< if the thread is executing
                         in the InnoDB context count > 0. */

  bool abort; /*!< if this flag is set then
              this transaction must abort when
              it can */

  trx_id_t id; /*!< transaction id */

  trx_id_t no; /*!< transaction serialization number:
               max trx id shortly before the
               transaction is moved to
               COMMITTED_IN_MEMORY state.
               Protected by trx_sys_t::mutex
               when trx->in_rw_trx_list. Initially
               set to TRX_ID_MAX. */

  /** State of the trx from the point of view of concurrency control
  and the valid state transitions.

  Possible states:

  TRX_STATE_NOT_STARTED
  TRX_STATE_FORCED_ROLLBACK
  TRX_STATE_ACTIVE
  TRX_STATE_PREPARED
  TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)

  Valid state transitions are:

  Regular transactions:
  * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED

  Auto-commit non-locking read-only:
  * NOT_STARTED -> ACTIVE -> NOT_STARTED

  XA (2PC):
  * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED

  Recovered XA:
  * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)

  XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
  * NOT_STARTED -> PREPARED -> (freed)

  Disconnected XA can become recovered:
  * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
  Disconnected means from mysql e.g due to the mysql client disconnection.
  Latching and various transaction lists membership rules:

  XA (2PC) transactions are always treated as non-autocommit.

  Transitions to ACTIVE or NOT_STARTED occur when
  !in_rw_trx_list (no trx_sys->mutex needed).

  Autocommit non-locking read-only transactions move between states
  without holding any mutex. They are !in_rw_trx_list.

  All transactions, unless they are determined to be ac-nl-ro,
  explicitly tagged as read-only or read-write, will first be put
  on the read-only transaction list. Only when a !read-only transaction
  in the read-only list tries to acquire an X or IX lock on a table
  do we remove it from the read-only list and put it on the read-write
  list. During this switch we assign it a rollback segment.

  When a transaction is NOT_STARTED, it can be in_mysql_trx_list if
  it is a user transaction. It cannot be in rw_trx_list.

  ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list.
  The transition ACTIVE->PREPARED is protected by trx_sys->mutex.

  ACTIVE->COMMITTED is possible when the transaction is in
  rw_trx_list.

  Transitions to COMMITTED are protected by both lock_sys->mutex
  and trx->mutex.

  NOTE: Some of these state change constraints are an overkill,
  currently only required for a consistent view for printing stats.
  This unnecessarily adds a huge cost for the general case. */

  trx_state_t state;

  /* If set, this transaction should stop inheriting (GAP)locks.
  Generally set to true during transaction prepare for RC or lower
  isolation, if requested. Needed for replication replay where
  we don't want to get blocked on GAP locks taken for protecting
  concurrent unique insert or replace operation. */
  bool skip_lock_inheritance;

  ReadView *read_view; /*!< consistent read view used in the
                       transaction, or NULL if not yet set */

  UT_LIST_NODE_T(trx_t)
  trx_list; /*!< list of transactions;
            protected by trx_sys->mutex. */
  UT_LIST_NODE_T(trx_t)
  no_list; /*!< Required during view creation
           to check for the view limit for
           transactions that are committing */

  trx_lock_t lock;   /*!< Information about the transaction
                     locks and state. Protected by
                     trx->mutex or lock_sys->mutex
                     or both */
  bool is_recovered; /*!< 0=normal transaction,
                     1=recovered, must be rolled back,
                     protected by trx_sys->mutex when
                     trx->in_rw_trx_list holds */

  hit_list_t hit_list; /*!< List of transactions to kill,
                       when a high priority transaction
                       is blocked on a lock wait. */

  os_thread_id_t killed_by; /*!< The thread ID that wants to
                            kill this transaction asynchronously.
                            This is required because we recursively
                            enter the handlerton methods and need
                            to distinguish between the kill thread
                            and the transaction thread.

                            Note: We need to be careful w.r.t the
                            Thread Pool. The thread doing the kill
                            should not leave InnoDB between the
                            mark and the actual async kill because
                            the running thread can change. */

  /* These fields are not protected by any mutex. */
  const char *op_info;   /*!< English text describing the
                         current operation, or an empty
                         string */
  ulint isolation_level; /*!< TRX_ISO_REPEATABLE_READ, ...; compared
                         against the isolation_level_t enumerators
                         by the helper methods at the end of this
                         struct */
  bool check_foreigns;   /*!< normally TRUE, but if the user
                         wants to suppress foreign key checks,
                         (in table imports, for example) we
                         set this FALSE */
  /*------------------------------*/
  /* MySQL has a transaction coordinator to coordinate two phase
  commit between multiple storage engines and the binary log. When
  an engine participates in a transaction, it's responsible for
  registering itself using the trans_register_ha() API. */
  bool is_registered; /* This flag is set to true after the
                      transaction has been registered with
                      the coordinator using the XA API, and
                      is set to false  after commit or
                      rollback. */
  /*------------------------------*/
  bool check_unique_secondary;
  /*!< normally TRUE, but if the user
  wants to speed up inserts by
  suppressing unique key checks
  for secondary indexes when we decide
  if we can use the insert buffer for
  them, we set this FALSE */
  bool flush_log_later;      /* In 2PC, we hold the
                             prepare_commit mutex across
                             both phases. In that case, we
                             defer flush of the logs to disk
                             until after we release the
                             mutex. */
  bool must_flush_log_later; /*!< this flag is set to TRUE in
                        trx_commit() if flush_log_later was
                        TRUE, and there were modifications by
                        the transaction; in that case we must
                        flush the log in
                        trx_commit_complete_for_mysql() */
  ulint duplicates;          /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
  bool has_search_latch;
  /*!< TRUE if this trx has latched the
  search system latch in S-mode.
  This now can only be true in
  row_search_mvcc, the btr search latch
  must have been released before exiting,
  and this flag would be set to false */
  trx_dict_op_t dict_operation; /**< @see enum trx_dict_op_t */

  bool ddl_operation;  /*!< True if this trx involves dd table
                        change */
  bool ddl_must_flush; /*!< True if this trx involves dd table
                       change, and must flush */
  bool in_truncate;    /* This trx is doing truncation */

  /* Fields protected by the srv_conc_mutex. */
  bool declared_to_be_inside_innodb;
  /*!< this is TRUE if we have declared
  this transaction in
  srv_conc_enter_innodb to be inside the
  InnoDB engine */
  ib_uint32_t n_tickets_to_enter_innodb;
  /*!< this can be > 0 only when
  declared_to_... is TRUE; when we come
  to srv_conc_innodb_enter, if the value
  here is > 0, we decrement this by 1 */
  ib_uint32_t dict_operation_lock_mode;
  /*!< 0, RW_S_LATCH, or RW_X_LATCH:
  the latch mode trx currently holds
  on dict_operation_lock. Protected
  by dict_operation_lock. */

  time_t start_time; /*!< time the state last time became
                     TRX_STATE_ACTIVE */

  /** Weight/Age of the transaction in the record lock wait queue. */
  int32_t age;

  /** For tracking if Weight/age has been updated. */
  uint64_t age_updated;

  lsn_t commit_lsn; /*!< lsn at the time of the commit */

  /*------------------------------*/
  THD *mysql_thd; /*!< MySQL thread handle corresponding
                  to this trx, or NULL */

  const char *mysql_log_file_name;
  /*!< if MySQL binlog is used, this field
  contains a pointer to the latest file
  name; this is NULL if binlog is not
  used */
  int64_t mysql_log_offset;
  /*!< if MySQL binlog is used, this
  field contains the end offset of the
  binlog entry */
  /*------------------------------*/
  ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables
                              used in the processing of the current
                              SQL statement in MySQL */
  ib_uint32_t mysql_n_tables_locked;
  /*!< how many tables the current SQL
  statement uses, except those
  in consistent read */
  /*------------------------------*/
#ifdef UNIV_DEBUG
  /** The following two fields are mutually exclusive.
  NOTE(review): only one field is declared inside this group in the
  excerpt shown here - confirm against the upstream source. */
  /* @{ */

  bool in_rw_trx_list; /*!< true if in trx_sys->rw_trx_list */
                       /* @} */
#endif                 /* UNIV_DEBUG */
  UT_LIST_NODE_T(trx_t)
  mysql_trx_list; /*!< list of transactions created for
                  MySQL; protected by trx_sys->mutex */
#ifdef UNIV_DEBUG
  bool in_mysql_trx_list;
  /*!< true if in
  trx_sys->mysql_trx_list */
#endif /* UNIV_DEBUG */
  /*------------------------------*/
  dberr_t error_state;             /*!< 0 if no error, otherwise error
                                   number; NOTE That ONLY the thread
                                   doing the transaction is allowed to
                                   set this field: this is NOT protected
                                   by any mutex */
  const dict_index_t *error_index; /*!< if the error number indicates a
                                   duplicate key error, a pointer to
                                   the problematic index is stored here */
  ulint error_key_num;             /*!< if the index creation fails to a
                                   duplicate key error, a mysql key
                                   number of that index is stored here */
  sess_t *sess;                    /*!< session of the trx, NULL if none */
  que_t *graph;                    /*!< query currently run in the session,
                                   or NULL if none; NOTE that the query
                                   belongs to the session, and it can
                                   survive over a transaction commit, if
                                   it is a stored procedure with a COMMIT
                                   WORK statement, for instance */
  /*------------------------------*/
  UT_LIST_BASE_NODE_T(trx_named_savept_t)
  trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
                  oldest first */
  /*------------------------------*/
  UndoMutex undo_mutex; /*!< mutex protecting the fields in this
                        section (down to undo_no_arr), EXCEPT
                        last_sql_stat_start, which can be
                        accessed only when we know that there
                        cannot be any activity in the undo
                        logs! */
  undo_no_t undo_no;    /*!< next undo log record number to
                        assign; since the undo log is
                        private for a transaction, this
                        is a simple ascending sequence
                        with no gaps; thus it represents
                        the number of modified/inserted
                        rows in a transaction */
  space_id_t undo_rseg_space;
  /*!< space id where last undo record
  was written */
  trx_savept_t last_sql_stat_start;
  /*!< undo_no when the last sql statement
  was started: in case of an error, trx
  is rolled back down to this undo
  number; see note at undo_mutex! */
  trx_rsegs_t rsegs;    /* rollback segments for undo logging */
  undo_no_t roll_limit; /*!< least undo number to undo during
                        a partial rollback; 0 otherwise */
#ifdef UNIV_DEBUG
  bool in_rollback;   /*!< true when the transaction is
                      executing a partial or full rollback */
#endif                /* UNIV_DEBUG */
  ulint pages_undone; /*!< number of undo log pages undone
                      since the last undo log truncation */
  /*------------------------------*/
  ulint n_autoinc_rows;       /*!< no. of AUTO-INC rows required for
                              an SQL statement. This is useful for
                              multi-row INSERTs */
  ib_vector_t *autoinc_locks; /* AUTOINC locks held by this
                              transaction. Note that these are
                              also in the lock list trx_locks. This
                              vector needs to be freed explicitly
                              when the trx instance is destroyed.
                              Protected by lock_sys->mutex. */
  /*------------------------------*/
  bool read_only;        /*!< true if transaction is flagged
                         as a READ-ONLY transaction.
                         if auto_commit && will_lock == 0
                         then it will be handled as a
                         AC-NL-RO-SELECT (Auto Commit Non-Locking
                         Read Only Select). A read only
                         transaction will not be assigned an
                         UNDO log. */
  bool auto_commit;      /*!< true if it is an autocommit */
  ib_uint32_t will_lock; /*!< Will acquire some locks. Increment
                         each time we determine that a lock will
                         be acquired by the MySQL layer. */
#ifndef UNIV_HOTBACKUP
  /*------------------------------*/
  fts_trx_t *fts_trx;       /*!< FTS information, or NULL if
                            transaction hasn't modified tables
                            with FTS indexes (yet). */
  doc_id_t fts_next_doc_id; /* The document id used for updates */
  /*------------------------------*/
  ib_uint32_t flush_tables; /*!< if "covering" the FLUSH TABLES",
                            count of tables being flushed. */

  /*------------------------------*/
  bool internal; /*!< true if it is a system/internal
                 transaction background task. Such
                 transactions are always treated as
                 read-write. */
  /*------------------------------*/
#ifdef UNIV_DEBUG
  ulint start_line;       /*!< Track where it was started from */
  const char *start_file; /*!< Filename where it was started */
#endif                    /* UNIV_DEBUG */

  lint n_ref; /*!< Count of references, protected
              by trx_t::mutex. We can't release the
              locks nor commit the transaction until
              this reference is 0.  We can change
              the state to COMMITTED_IN_MEMORY to
              signify that it is no longer
              "active". */

  /** Version of this instance. It is incremented each time the
  instance is re-used in trx_start_low(). It is used to track
  whether a transaction has been restarted since it was tagged
  for asynchronous rollback. */
  ulint version;

  XID *xid;                    /*!< X/Open XA transaction
                               identification to identify a
                               transaction branch */
  trx_mod_tables_t mod_tables; /*!< List of tables that were modified
                               by this transaction */
#endif                         /* !UNIV_HOTBACKUP */
  /*------------------------------*/
  bool api_trx;                /*!< trx started by InnoDB API */
  bool api_auto_commit;        /*!< automatic commit */
  bool read_write;             /*!< if read and write operation */

  /*------------------------------*/
  char *detailed_error;          /*!< detailed error message for last
                                 error, or empty. */
  FlushObserver *flush_observer; /*!< flush observer */

#ifdef UNIV_DEBUG
  bool is_dd_trx; /*!< True if the transaction is used for
                  doing Non-locking Read-only Read
                  Committed on DD tables */
#endif            /* UNIV_DEBUG */
  ulint magic_n;

  /** Check whether the transaction runs at READ UNCOMMITTED.
  @return true if isolation_level equals READ_UNCOMMITTED */
  bool is_read_uncommitted() const {
    return (isolation_level == READ_UNCOMMITTED);
  }

  /** Decide from the isolation level whether gap locks are skipped.
  @return true for READ_UNCOMMITTED and READ_COMMITTED, false for
  REPEATABLE_READ and SERIALIZABLE */
  bool skip_gap_locks() const {
    switch (isolation_level) {
      case READ_UNCOMMITTED:
      case READ_COMMITTED:
        return (true);
      case REPEATABLE_READ:
      case SERIALIZABLE:
        return (false);
    }
    /* Reached only when isolation_level matches none of the four
    enumerators; ut_ad(0) is presumably a debug-only assertion, after
    which false is returned. */
    ut_ad(0);
    return (false);
  }

  /** Semi-consistent reads are allowed exactly when gap locks are skipped.
  @return the value of skip_gap_locks() */
  bool allow_semi_consistent() const { return (skip_gap_locks()); }
};

上面太长不看,简化如下:

struct trx_t {
	...
	trx_lock_t lock;   /*!< Information about the transaction
                     locks and state. Protected by
                     trx->mutex or lock_sys->mutex
                     or both */
	...
}

在 8.0 中,将 lock 单独从 trx_t 中解耦出来。观察 trx_lock_t 的定义。

/** Latching protocol for trx_lock_t::que_state.  trx_lock_t::que_state
 captures the state of the query thread during the execution of a query.
 This is different from a transaction state. The query state of a transaction
 can be updated asynchronously by other threads.  The other threads can be
 system threads, like the timeout monitor thread or user threads executing
 other queries. Another thing to be mindful of is that there is a delay between
 when a query thread is put into LOCK_WAIT state and before it actually starts
 waiting.  Between these two events it is possible that the query thread is
 granted the lock it was waiting for, which implies that the state can be
 changed asynchronously.

 All these operations take place within the context of locking. Therefore state
 changes within the locking code must acquire both the lock mutex and the
 trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
 trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient
 to only acquire the trx->mutex.
 To query the state either of the mutexes is sufficient within the locking
 code and no mutex is required when the query thread is no longer waiting. */

/** The locks and state of an active transaction. Protected by
lock_sys->mutex, trx->mutex or both. Embedded in trx_t as the member
`lock`. Holds the query-thread state, the lock currently being waited for
(wait_lock), the list of all locks requested by the transaction
(trx_locks), and pools of pre-allocated lock objects. */
struct trx_lock_t {
  ulint n_active_thrs; /*!< number of active query threads */

  trx_que_t que_state; /*!< valid when trx->state
                       == TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
                       TRX_QUE_LOCK_WAIT, ... */

  lock_t *wait_lock;         /*!< if trx execution state is
                             TRX_QUE_LOCK_WAIT, this points to
                             the lock request, otherwise this is
                             NULL; set to non-NULL when holding
                             both trx->mutex and lock_sys->mutex;
                             set to NULL when holding
                             lock_sys->mutex; readers should
                             hold lock_sys->mutex, except when
                             they are holding trx->mutex and
                             wait_lock==NULL */
  ib_uint64_t deadlock_mark; /*!< A mark field that is initialized
                             to and checked against lock_mark_counter
                             by lock_deadlock_recursive(). */
  bool was_chosen_as_deadlock_victim;
  /*!< when the transaction decides to
  wait for a lock, it sets this to false;
  if another transaction chooses this
  transaction as a victim in deadlock
  resolution, it sets this to true.
  Protected by trx->mutex. */
  time_t wait_started; /*!< lock wait started at this time,
                       protected only by lock_sys->mutex */

  que_thr_t *wait_thr; /*!< query thread belonging to this
                       trx that is in QUE_THR_LOCK_WAIT
                       state. For threads suspended in a
                       lock wait, this is protected by
                       lock_sys->mutex. Otherwise, this may
                       only be modified by the thread that is
                       serving the running transaction. */

  lock_pool_t rec_pool; /*!< Pre-allocated record locks */

  lock_pool_t table_pool; /*!< Pre-allocated table locks */

  ulint rec_cached; /*!< Next free rec lock in pool */

  ulint table_cached; /*!< Next free table lock in pool */

  mem_heap_t *lock_heap; /*!< memory heap for trx_locks;
                         protected by lock_sys->mutex */

  trx_lock_list_t trx_locks; /*!< locks requested by the transaction;
                             insertions are protected by trx->mutex
                             and lock_sys->mutex; removals are
                             protected by lock_sys->mutex */

  lock_pool_t table_locks; /*!< All table locks requested by this
                           transaction, including AUTOINC locks */

  ulint n_rec_locks; /*!< number of rec locks in this trx */
#ifdef UNIV_DEBUG
  /** When a transaction is forced to rollback due to a deadlock
  check or by another high priority transaction this is true. Used
  by debug checks in lock0lock.cc */
  bool in_rollback;
#endif /* UNIV_DEBUG */

  /** The transaction called ha_innobase::start_stmt() to
  lock a table. Most likely a temporary table. */
  bool start_stmt;
};
/** List type for the lock_t objects requested by one transaction. It is the
declared type of the trx_locks member of trx_lock_t, which is quoted again
right below for reference. */
typedef UT_LIST_BASE_NODE_T(lock_t) trx_lock_list_t;

trx_lock_list_t trx_locks; /*!< locks requested by the transaction;
                           insertions are protected by trx->mutex
                           and lock_sys->mutex; removals are
                           protected by lock_sys->mutex */

每个事务都会维护执行过程中加的所有锁(trx_locks,双向链表),以及发生锁等待时正在等待的锁(wait_lock)。

lock_sys 中保存的对象为 lock_t ,它可以描述一个行锁或者表锁,如果是行锁,会被 lock_sys 维护,若是表锁,会被对应的 dict_table_t 维护

/** Lock struct; protected by lock_sys->mutex. Describes either a table
lock or a record lock: the union below holds the type-specific part, and
the type bits of type_mode (see type()) tell which union member is in use. */
struct lock_t {
  /** transaction owning the lock */
  trx_t *trx;

  /** list of the locks of the transaction */
  UT_LIST_NODE_T(lock_t) trx_locks;

  /** Index for a record lock */
  dict_index_t *index;

  /** Hash chain node for a record lock. The link node in a singly
  linked list, used by the hash table. */
  lock_t *hash;

  union {
    /** Table lock */
    lock_table_t tab_lock;

    /** Record lock */
    lock_rec_t rec_lock;
  };

#ifdef HAVE_PSI_THREAD_INTERFACE
#ifdef HAVE_PSI_DATA_LOCK_INTERFACE
  /** Performance schema thread that created the lock. */
  ulonglong m_psi_internal_thread_id;

  /** Performance schema event that created the lock. */
  ulonglong m_psi_event_id;
#endif /* HAVE_PSI_DATA_LOCK_INTERFACE */
#endif /* HAVE_PSI_THREAD_INTERFACE */

  /** The lock type and mode bit flags.
  LOCK_GAP or LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION, wait flag, ORed */
  uint32_t type_mode;

#if defined(UNIV_DEBUG)
  /** Timestamp when it was created. */
  uint64_t m_seq;
#endif /* UNIV_DEBUG */

  /** Remove GAP lock from a next Key Lock by setting the
  LOCK_REC_NOT_GAP bit in type_mode. The debug assertions require a
  record lock that is neither a gap lock nor an insert intention lock. */
  void remove_gap_lock() {
    ut_ad(!is_gap());
    ut_ad(!is_insert_intention());
    ut_ad(is_record_lock());

    type_mode |= LOCK_REC_NOT_GAP;
  }

  /** Determine if the lock object is a record lock.
  @return true if record lock, false otherwise. */
  bool is_record_lock() const { return (type() == LOCK_REC); }

  /** Determine if it is predicate lock.
  @return true if either the LOCK_PREDICATE or the LOCK_PRDT_PAGE bit is
  set, false otherwise. */
  bool is_predicate() const {
    return (type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
  }

  /** @return true if the lock wait flag is set */
  bool is_waiting() const { return (type_mode & LOCK_WAIT); }

  /** @return true if the gap lock bit is set */
  bool is_gap() const { return (type_mode & LOCK_GAP); }

  /** @return true if the not gap lock bit is set */
  bool is_record_not_gap() const { return (type_mode & LOCK_REC_NOT_GAP); }

  /** @return true if the insert intention bit is set */
  bool is_insert_intention() const {
    return (type_mode & LOCK_INSERT_INTENTION);
  }

  /** @return the lock type bits (type_mode masked with LOCK_TYPE_MASK),
  e.g. LOCK_REC or LOCK_TABLE */
  uint32_t type() const { return (type_mode & LOCK_TYPE_MASK); }

  /** @return the precise lock mode (type_mode masked with
  LOCK_MODE_MASK) */
  lock_mode mode() const {
    return (static_cast<lock_mode>(type_mode & LOCK_MODE_MASK));
  }

  /** Get lock hash table
  @return the hash table that lock_hash_get() selects for type_mode */
  hash_table_t *hash_table() const { return (lock_hash_get(type_mode)); }

  /** @return the record lock tablespace ID */
  space_id_t space_id() const {
    ut_ad(is_record_lock());

    return (rec_lock.space);
  }

  /** @return the record lock page number */
  page_no_t page_no() const {
    ut_ad(is_record_lock());

    return (rec_lock.page_no);
  }

  /** @return the transaction's query thread state. */
  trx_que_t trx_que_state() const { return (trx->lock.que_state); }

  /** Print the lock object into the given output stream.
  @param[in,out] out the output stream
  @return the given output stream. */
  std::ostream &print(std::ostream &out) const;

  /** Convert the member 'type_mode' into a human readable string.
  @return human readable string */
  std::string type_mode_string() const;

  /** @return the string/text representation of the lock type:
  "LOCK_REC" or "LOCK_TABLE"; any other type value reaches ut_error. */
  const char *type_string() const {
    switch (type_mode & LOCK_TYPE_MASK) {
      case LOCK_REC:
        return ("LOCK_REC");
      case LOCK_TABLE:
        return ("LOCK_TABLE");
      default:
        ut_error;
    }
  }
};

不论行锁还是表锁,都记录了所属的事务,锁创建后会通过 trx_locks(next和prev指针)添加到所属事务已获得锁链表中( trx_t.lock.trx_locks ),如果创建的锁为行锁,会使用(space_no , page_no )经过hash函数计算后添加到 lock_sys 中。如果创建的锁为表锁,将其加入到对应dict_table_t的表锁链表尾部(lock_table_create)。

行锁与表锁

InnoDB 中一个事务中的行锁是按照 page 和锁类型组织的:对于相同的锁类型,锁一行和锁这行所在 page 中的所有行,存储代价是一样的。lock_rec_t 中的成员 n_bits 记录了 bitmap 的位数,bitmap 紧跟在锁结构体之后。bitmap 占用的存储空间为:(1 + (n_bits - 1) / 8) bytes,即 lock_rec_t 占用的实际存储空间为:sizeof(lock_rec_t) + 1 + (n_bits - 1) / 8 bytes。bitmap 中的每一位标识对应的行是否加锁。

/** Record lock for a page. Identifies the page by (space, page_no) and
records the size of the lock bitmap; the bitmap itself is not a member of
this struct. */
struct lock_rec_t {
  space_id_t space;  /*!< space id */
  page_no_t page_no; /*!< page number */
  uint32_t n_bits;   /*!< number of bits in the lock
                     bitmap; NOTE: the lock bitmap is
                     placed immediately after the
                     lock struct */

  /** Print the record lock into the given output stream
  @param[in,out] out the output stream
  @return the given output stream. */
  std::ostream &print(std::ostream &out) const;
};

lock_table_t 表示表锁,保存了对应的 table,以及该 table 上表锁链表的节点 locks(prev 和 next 指针)

/** A table lock. Points at the locked table and carries the list node
that links this lock with the other locks on the same table. */
struct lock_table_t {
  dict_table_t *table; /*!< database table in dictionary
                       cache */
  UT_LIST_NODE_T(lock_t)
  locks; /*!< list of locks on the same
         table */
  /** Print the table lock into the given output stream
  @param[in,out] out the output stream
  @return the given output stream. */
  std::ostream &print(std::ostream &out) const;
};

lock_t 表示一个锁,表锁或者行锁,其中的 trx_locks 成员会被链接到 trx_t 中的已加锁链表 trx_t.lock.trx_locks 中,而 lock_table_t 中的 locks 成员是需要链接到对应 table 后面,作用不同但容易混淆;lock_t中的 hash 和 index 成员放到 lock_rec_t 结构体中更加合适,可能是基于 lock_t 是行锁的可能性更大的前提考虑的

Wait-For-Graph在InnoDB中的实现

InnoDB 在实现 Wait-For-Graph 时基于性能方面的考虑,定义了两个变量:

LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK (默认200):深度优先遍历的层数超过此值,即认为发生死锁,回滚当前事务

LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK(默认1000000):lock_deadlock_recursive递归调用的次数超过此值,即认为发生死锁,回滚当前事务

笔者测试的版本中,死锁检测单独为一个类 DeadlockChecker

/** Deadlock detector for a joining lock request.

The checker walks the wait-for relation iteratively (see search()), using
the fixed-size static stack s_states instead of recursion or heap
allocation. The walk is bounded by LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK
(stack depth) and LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK (visited nodes). */
class DeadlockChecker {
 public:
  /** Checks if a joining lock request results in a deadlock. If
  a deadlock is found this function will resolve the deadlock
  by choosing a victim transaction and rolling it back. It
  will attempt to resolve all deadlocks. The returned transaction
  will be the joining transaction, or NULL if some other
  transaction was chosen as a victim and rolled back or no
  deadlock found.

  @param lock lock the transaction is requesting
  @param trx transaction requesting the lock

  @return transaction chosen as victim or NULL */
  static const trx_t *check_and_resolve(const lock_t *lock, trx_t *trx);

 private:
  /** Do a shallow copy. Default destructor OK.
  The counters m_cost and m_n_elems start at zero and m_too_deep
  starts as false (value-initialized).
  @param trx the start transaction (start node)
  @param wait_lock lock that a transaction wants
  @param mark_start visited node counter */
  DeadlockChecker(const trx_t *trx, const lock_t *wait_lock,
                  uint64_t mark_start)
      : m_cost(),
        m_start(trx),
        m_too_deep(),
        m_wait_lock(wait_lock),
        m_mark_start(mark_start),
        m_n_elems() {}

  /** Check if the search is too deep.
  @return true if either the number of states on the stack exceeds
  LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK or the number of steps taken exceeds
  LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK */
  bool is_too_deep() const {
    return (m_n_elems > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK ||
            m_cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK);
  }

  /** Save current state.
  @param lock lock to push on the stack.
  @param heap_no the heap number to push on the stack.
  @return false if stack is full. */
  bool push(const lock_t *lock, ulint heap_no) {
    ut_ad((lock_get_type_low(lock) & LOCK_REC) ||
          (lock_get_type_low(lock) & LOCK_TABLE));

    /* A table lock must come with heap_no == ULINT_UNDEFINED and
    a record lock must not. */
    ut_ad(((lock_get_type_low(lock) & LOCK_TABLE) != 0) ==
          (heap_no == ULINT_UNDEFINED));

    /* Ensure that the stack is bounded. */
    if (m_n_elems >= UT_ARR_SIZE(s_states)) {
      return (false);
    }

    state_t &state = s_states[m_n_elems++];

    /* The current m_wait_lock is saved too, so that pop() can
    restore it when the search backtracks. */
    state.m_lock = lock;
    state.m_wait_lock = m_wait_lock;
    state.m_heap_no = heap_no;

    return (true);
  }

  /** Restore state. Also restores m_wait_lock to the value it had
  when the state was pushed.
  @param[out] lock current lock
  @param[out] heap_no current heap_no */
  void pop(const lock_t *&lock, ulint &heap_no) {
    ut_a(m_n_elems > 0);

    const state_t &state = s_states[--m_n_elems];

    lock = state.m_lock;
    heap_no = state.m_heap_no;
    m_wait_lock = state.m_wait_lock;
  }

  /** Check whether the node has been visited, i.e. whether the owning
  transaction's deadlock_mark is greater than m_mark_start.
  @param lock lock to check
  @return true if the node has been visited */
  bool is_visited(const lock_t *lock) const {
    return (lock->trx->lock.deadlock_mark > m_mark_start);
  }

  /** Get the next lock in the queue that is owned by a transaction
  whose sub-tree has not already been searched.
  @param lock Lock in queue
  @param heap_no heap_no if lock is a record lock else ULINT_UNDEFINED
  @return next lock or NULL if at end of queue */
  const lock_t *get_next_lock(const lock_t *lock, ulint heap_no) const;

  /** Get the first lock to search. The search starts from the current
  wait_lock. What we are really interested in is an edge from the
  current wait_lock's owning transaction to another transaction that has
  a lock ahead in the queue. We skip locks where the owning transaction's
  sub-tree has already been searched.

  For record locks, we first position the iterator on first lock on
  the page and then reposition on the actual heap_no. This is required
  due to the way the record lock hash is implemented.

  @param[out] heap_no if rec lock, else ULINT_UNDEFINED.

  @return first lock or NULL */
  const lock_t *get_first_lock(ulint *heap_no) const;

  /** Notify that a deadlock has been detected and print the conflicting
  transaction info.
  @param lock lock causing deadlock */
  void notify(const lock_t *lock) const;

  /** Select the victim transaction that should be rolledback.
  @return victim transaction */
  const trx_t *select_victim() const;

  /** Rollback transaction selected as the victim. */
  void trx_rollback();

  /** Looks iteratively for a deadlock. Note: the joining transaction
  may have been granted its lock by the deadlock checks.

  @return 0 if no deadlock else the victim transaction.*/
  const trx_t *search();

#ifdef UNIV_DEBUG
  /** Determines if a situation in which the lock takes part in a deadlock
  cycle is expected (as in: handled correctly) or not (say because it is on a DD
  table, for which there is no reason to expect a deadlock and we don't handle
  deadlocks correctly). The purpose of the function is to use it in an assertion
  failing as soon as the deadlock is identified, to give developer a chance to
  investigate the root cause of the situation (without such assertion, the code
  might continue to run and either fail at later stage when the data useful for
  debugging is no longer on stack, or not fail at all, which is risky).
  @param[in] lock lock found in a deadlock cycle
  @return true if we expect that this lock can take part in a deadlock cycle */
  static bool is_allowed_to_be_on_cycle(const lock_t *lock);
#endif /* UNIV_DEBUG */

  /** Print transaction data to the deadlock file and possibly to stderr.
  @param trx transaction
  @param max_query_len max query length to print */
  static void print(const trx_t *trx, ulint max_query_len);

  /** rewind(3) the file used for storing the latest detected deadlock
  and print a heading message to stderr if printing of all deadlocks to
  stderr is enabled. */
  static void start_print();

  /** Print lock data to the deadlock file and possibly to stderr.
  @param lock record or table type lock */
  static void print(const lock_t *lock);

  /** Print a message to the deadlock file and possibly to stderr.
  @param msg message to print */
  static void print(const char *msg);

  /** Print info about transaction that was rolled back.
  @param trx transaction rolled back
  @param lock lock trx wants */
  static void rollback_print(const trx_t *trx, const lock_t *lock);

 private:
  /** DFS state information, used during deadlock checking. */
  struct state_t {
    const lock_t *m_lock;      /*!< Current lock */
    const lock_t *m_wait_lock; /*!< Waiting for lock */
    ulint m_heap_no;           /*!< heap number if rec lock */
  };

  /** Used in deadlock tracking. Protected by lock_sys->mutex. */
  static uint64_t s_lock_mark_counter;

  /** Calculation steps thus far. It is the count of the nodes visited. */
  ulint m_cost;

  /** Joining transaction that is requesting a lock in an
  incompatible mode */
  const trx_t *m_start;

  /** true if search was too deep and was aborted */
  bool m_too_deep;

  /** Lock that trx wants */
  const lock_t *m_wait_lock;

  /**  Value of lock_mark_count at the start of the deadlock check. */
  uint64_t m_mark_start;

  /** Number of states pushed onto the stack */
  size_t m_n_elems;

  /** This is to avoid malloc/free calls. */
  static state_t s_states[MAX_STACK_SIZE];
};
/** Looks iteratively for a deadlock. Note: the joining transaction may
have been granted its lock by the deadlock checks.

The search is a depth-first walk driven by an explicit stack (push()/pop())
instead of recursion. At every step `lock` is a lock in the queue of the
current m_wait_lock; when the owner of a conflicting lock is itself waiting,
the walk descends into that owner's wait_lock.

@return 0 if no deadlock else the victim transaction instance. When the
search is aborted for being too deep, m_too_deep is set and the joining
transaction (m_start) is returned. */
const trx_t *DeadlockChecker::search() {
  ut_ad(lock_mutex_own());
  ut_ad(!trx_mutex_own(m_start));

  ut_ad(m_start != NULL);
  ut_ad(m_wait_lock != NULL);
  check_trx_state(m_wait_lock->trx);
  ut_ad(m_mark_start <= s_lock_mark_counter);

  /* Look at the locks ahead of wait_lock in the lock queue. */
  ulint heap_no;
  const lock_t *lock = get_first_lock(&heap_no);

  for (;;) {
    /* We should never visit the same sub-tree more than once. */
    ut_ad(lock == NULL || !is_visited(lock));

    /* The current queue is exhausted: backtrack. Each pop() also
    restores m_wait_lock to the value saved by the matching push(),
    then we continue with the lock after the one we descended from.
    Repeat while that level is exhausted too. */
    while (m_n_elems > 0 && lock == NULL) {
      /* Restore previous search state. */

      pop(lock, heap_no);

      lock = get_next_lock(lock, heap_no);
    }

    if (lock == NULL) {
      /* Stack is empty and nothing is left to examine. */
      break;
    } else if (lock == m_wait_lock) {
      /* We reached the waiting lock itself in its queue. */
      /* We can mark this subtree as searched */
      ut_ad(lock->trx->lock.deadlock_mark <= m_mark_start);

      /* A fresh counter value is greater than m_mark_start, so
      is_visited() will report true for this transaction from now on. */
      lock->trx->lock.deadlock_mark = ++s_lock_mark_counter;

      /* We are not prepared for an overflow. This 64-bit
      counter should never wrap around. At 10^9 increments
      per second, it would take 10^3 years of uptime. */

      ut_ad(s_lock_mark_counter > 0);

      /* Backtrack */
      lock = NULL;

    } else if (!lock_has_to_wait(m_wait_lock, lock)) {
      /* No conflict, next lock */
      lock = get_next_lock(lock, heap_no);

    } else if (lock->trx == m_start) {
      /* Found a cycle: a conflicting lock is owned by the transaction
      the search started from. */

      notify(lock);

      /* We don't expect deadlocks with most DD tables and all SDI tables.
      If we find, we crash early to find the transactions causing deadlock */
      ut_ad(is_allowed_to_be_on_cycle(lock));
      ut_ad(is_allowed_to_be_on_cycle(m_wait_lock));

      return (select_victim());

    } else if (is_too_deep()) {
      /* Search too deep to continue. The joining transaction is
      returned and m_too_deep records why. */
      m_too_deep = true;
      return (m_start);

    } else if (lock->trx_que_state() == TRX_QUE_LOCK_WAIT) {
      /* Another trx ahead has requested a lock in an
      incompatible mode, and is itself waiting for a lock. */

      ++m_cost;

      /* Save where we are so the walk can resume here after the
      sub-tree has been searched; a full stack is handled the same
      way as a too-deep search. */
      if (!push(lock, heap_no)) {
        m_too_deep = true;
        return (m_start);
      }

      /* Descend: continue from the lock that the owner of the
      conflicting lock is waiting for. */
      m_wait_lock = lock->trx->lock.wait_lock;

      lock = get_first_lock(&heap_no);

      /* Skip a first lock whose owner was already marked as visited. */
      if (is_visited(lock)) {
        lock = get_next_lock(lock, heap_no);
      }

    } else {
      /* Conflicting lock whose owner is not in TRX_QUE_LOCK_WAIT:
      nothing to descend into, move on to the next lock. */
      lock = get_next_lock(lock, heap_no);
    }
  }

  ut_a(lock == NULL && m_n_elems == 0);

  /* No deadlock found. */
  return (0);
}

InnoDB通常在哪些情况下可能发生死锁

回顾一下cs课本上关于发生死锁的几个条件:

  1. 资源互斥;
  2. 请求保持;
  3. 不剥夺;
  4. 循环等待

对于DB而言,导致死锁意味着发生了循环等待,在InnoDB中由于行锁的引入,比较容易发生死锁,下面总结一些发生死锁的情况(不全):

  1. 同一索引上,两个session相反的顺序加锁多行记录
  2. Primary key 和 Secondary index:一个 session 通过 primary key 找到记录并更新 Secondary index 字段,另一个 session 通过 Secondary index 找到并更新同一记录,两者在主键索引和二级索引上的加锁顺序相反
  3. UPDATE/DELETE通过不同的二级索引更新多条记录,可能造成在Primary key上不同的加锁顺序
posted @ 2019-12-11 17:36  菁芜  阅读(507)  评论(0)  编辑  收藏  举报