【GreatSQL优化器-10】find_best_ref
【GreatSQL优化器-10】find_best_ref
一、find_best_ref介绍
GreatSQL的优化器对于join的表需要根据行数和cost来确定最后哪张表先执行哪张表后执行,这里面就涉及到预估满足条件的表数据,在keyuse_array数组有值的情况下,会用find_best_ref函数来通过索引进行cost和rows的估计,并且会找出最优的索引。这样就可能不会继续用后面的calculate_scan_cost()进行全表扫描计算cost,可以节省查询时间。
这个功能是对之前【优化器05-条件过滤】的补充功能,二者有可能一起用,也有可能只选择一种计算,要看具体条件。
下面用一个简单的例子来说明find_best_ref函数获得的结果。
CREATE TABLE t1 (c1 INT PRIMARY KEY, c2 INT,date1 DATETIME);
INSERT INTO t1 VALUES (1,10,'2021-03-25 16:44:00.123456'),(2,1,'2022-03-26 16:44:00.123456'),(3,4,'2023-03-27 16:44:00.123456'),(5,5,'2024-03-25 16:44:00.123456'),(7,null,'2020-03-25 16:44:00.123456'),(8,10,'2020-10-25 16:44:00.123456'),(11,16,'2023-03-25 16:44:00.123456');
CREATE TABLE t2 (cc1 INT PRIMARY KEY, cc2 INT);
INSERT INTO t2 VALUES (1,3),(2,1),(3,2),(4,3),(5,15);
CREATE TABLE t3 (ccc1 INT, ccc2 varchar(100));
INSERT INTO t3 VALUES (1,'aa1'),(2,'bb1'),(3,'cc1'),(4,'dd1'),(null,'ee');
CREATE INDEX idx1 ON t1(c2);
CREATE INDEX idx2 ON t1(c2,date1);
CREATE INDEX idx2_1 ON t2(cc2);
CREATE INDEX idx3_1 ON t3(ccc1);
greatsql> SELECT * FROM t1 join t2 ON t1.c1=t2.cc1 and t1.c2<5;
          {
            "ref_optimizer_key_uses": [ 首先这里要有keyuse_array
              {
                "table": "`t1`",
                "field": "c1",
                "equals": "`t2`.`cc1`",
                "null_rejecting": true
              },
              {
                "table": "`t2`",
                "field": "cc1",
                "equals": "`t1`.`c1`",
                "null_rejecting": true
              }
            ]
          },
            "considered_execution_plans": [
              {
                "plan_prefix": [
                ],
                "table": "`t1`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "access_type": "ref",
                      "index": "PRIMARY",
                      "usable": false,
                      "chosen": false
                    },
                    {
                      "rows_to_scan": 2,
                      "filtering_effect": [
                      ],
                      "final_filtering_effect": 1,
                      "access_type": "range",
                      "range_details": {
                        "used_index": "idx2"
                      },
                      "resulting_rows": 2,
                      "cost": 0.660457,
                      "chosen": true
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 2,
                "cost_for_plan": 0.660457,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`t1`"
                    ],
                    "table": "`t2`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1, 这里就是通过find_best_ref()获得的结果
                          "cost": 0.7, 这里就是通过find_best_ref()获得的结果
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "access_type": "scan",
                          "chosen": false,
                          "cause": "covering_index_better_than_full_scan"
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 2,
                    "cost_for_plan": 1.36046,
                    "chosen": true
                  }
                ]
              },
              {
                "plan_prefix": [
                ],
                "table": "`t2`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "access_type": "ref",
                      "index": "PRIMARY",
                      "usable": false,
                      "chosen": false
                    },
                    {
                      "rows_to_scan": 5,
                      "filtering_effect": [
                      ],
                      "final_filtering_effect": 1,
                      "access_type": "scan",
                      "resulting_rows": 5,
                      "cost": 0.75,
                      "chosen": true
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 5,
                "cost_for_plan": 0.75,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`t2`"
                    ],
                    "table": "`t1`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1, 这里就是通过find_best_ref()获得的结果
                          "cost": 5.5, 这里就是通过find_best_ref()获得的结果
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "rows_to_scan": 2,
                          "filtering_effect": [
                          ],
                          "final_filtering_effect": 1,
                          "access_type": "range",
                          "range_details": {
                            "used_index": "idx2"
                          },
                          "resulting_rows": 2,
                          "cost": 3.30229, 
                          "chosen": true
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 10,
                    "cost_for_plan": 4.05229,
                    "pruned_by_cost": true
                  }
                ]
              }
            ]
二、find_best_ref代码解释
find_best_ref的计算在best_access_path函数实现,用来在keyuse_array数组有值的时候,预估不同join连接时候的join表的读取行数和可能的cost,以及找出cost最低的索引。
void Optimize_table_order::best_access_path(JOIN_TAB *tab,
                                            const table_map remaining_tables,
                                            const uint idx, bool disable_jbuf,
                                            const double prefix_rowcount,
                                            POSITION *pos) {
  if (tab->keyuse() != nullptr &&
      (table->file->ha_table_flags() & HA_NO_INDEX_ACCESS) == 0 &&
      (!table->set_counter()))
    best_ref =
        find_best_ref(tab, remaining_tables, idx, prefix_rowcount,
                      &found_condition, &ref_depend_map, &used_key_parts);  
  接着会进行一系列判断来决定是否要继续用calculate_scan_cost()方式来进行估计cost,然后二者进行比较选取更小的cost。
  具体见下面<<采用的rows和cost结果>>表
  两个方式都会使用calculate_condition_filter()计算条件过滤百分比。
}
Key_use *Optimize_table_order::find_best_ref() {
  ha_rows distinct_keys_est = tab->records() / MATCHING_ROWS_IN_OTHER_TABLE;
  // 遍历keyuse_array数组,按照索引进行计算行数和cost,选取最优的索引
  for (Key_use *keyuse = tab->keyuse(); keyuse->table_ref == tab->table_ref;) {
   如果是第一张表就跳过,因为第一张表不需要用索引扫描。
   主要计算得出下面<<find_best_ref函数获得的结果>>表里的三个变量结果,用于找出最优索引。
  }
}
表:采用的rows和cost结果
| 采用的rows和cost结果 | 场景(以下任一个条件满足) | 
|---|---|
| find_best_ref()的结果 | find_best_ref()方式找到的行数和cost都小于之前estimate_rowcount计算的值 | 
| 生成了mm tree并且找到最优索引,并且range_scan()->type结果为ROWID_INTERSECTION | |
| 没有生成mm tree但是找到最优索引 | |
| force index方式 | |
| calculate_scan_cost()的结果 | 以上都不满足 | 
表:find_best_ref函数获得的结果
| 函数结果 | 说明 | 值 | 
|---|---|---|
| cur_fanout | 总共需要读取多少行 | 计算方式见表一 | 
| cur_read_cost | 读的开销 | 计算方式见表二 | 
| best_ref | cost最低的索引 | 索引优先级见表五 | 
表一:cur_fanout计算方式
| 场景 | 值 | 说明 | |
|---|---|---|---|
| 唯一索引 | 1 | ||
| 索引覆盖所有涉及列并且条件是列等于常量 | table->quick_keys有值 | table->quick_rows | |
| table->quick_keys无值 | tab->records() / distinct_keys_est | distinct_keys_est表示的是一张表对应一个索引值的行数有几行,计算方式见表三 | |
| 索引覆盖所有涉及列并且条件不是列等于常量 | keyinfo->has_records_per_key() | keyinfo->records_per_key | 计算方式见表四 | 
| !keyinfo->has_records_per_key() | ((double)tab->records() / (double)distinct_keys_est * (1.0 + ((double)(table->s->max_key_length - keyinfo->key_length) / (double)table->s->max_key_length))); if (cur_fanout < 2.0) cur_fanout = 2.0 | ||
| 索引不能覆盖所有涉及列并且条件是列等于常量 | table->quick_rows | ||
| 索引不能覆盖所有涉及列并且条件不是列等于常量 | 当前用的索引keyinfo->has_records_per_key() | keyinfo->records_per_key | records_per_key表示的是一张表对应一个索引值的行数有几行,计算方式见表四 | 
| 当前用的索引没有keyinfo->has_records_per_key() | (x * (b-a) + a*c-b)/(c-1) | b = records matched by whole key a = records matched by first key part c = number of key parts in key x = used key parts (1 <= x <= c) | 
表二:cur_read_cost计算方式
| 场景 | 值 | 
|---|---|
| 唯一索引 | 之前左连接表的行数卷积 * table->cost_model()->page_read_cost(1.0) | 
| 索引覆盖所有涉及列 | prefix_rowcount * find_cost_for_ref(thd, table, key, cur_fanout, tab->worst_seeks) | 
| 索引不能覆盖所有涉及列 | prefix_rowcount * find_cost_for_ref(thd, table, key, cur_fanout, tab->worst_seeks) | 
表三:distinct_keys_est计算方式
| 场景 | 值 | 
|---|---|
| 初始值 | tab->records() / 10 | 
| distinct_keys_est > keyuse->ref_table_rows | keyuse->ref_table_rows | 
| distinct_keys_est < 10 | 10 | 
| tab->records() < distinct_keys_est | tab->records() | 
注:10是MATCHING_ROWS_IN_OTHER_TABLE
表四:keyuse->ref_table_rows计算方式
| keyuse->used_tables | 场景 | 值 | 
|---|---|---|
| PSEUDO_TABLE_BITS | 只有一张表 | max(tmp_table->file->stats.records, 100) 这里100是固定值 | 
| OUTER_REF_TABLE_BIT | 有子查询的话只有一行满足 | 1 | 
注:通过JOIN::optimize_keyuse()函数获得
表五:索引类型选择优先级
| idx_type | 优先级 | 说明 | 对应的access_type | 
|---|---|---|---|
| CLUSTERED_PK | 最高 | primary key,优先级最高 | eq_ref | 
| UNIQUE | 高 | 唯一索引 | eq_ref | 
| NOT_UNIQUE | 低 | 非唯一索引 | ref | 
| FULLTEXT | 最低 | fulltext索引,优先级最低 | ref | 
三、实际例子说明
接下来看一开始的例子来说明上面的代码:
greatsql> SELECT * FROM t1 JOIN t2 ON t1.c1=t2.cc1 AND t1.c2<5;
          {
            "ref_optimizer_key_uses": [
              {
                "table": "`t1`",
                "field": "c1",
                "equals": "`t2`.`cc1`",
                "null_rejecting": true
              },
              {
                "table": "`t2`",
                "field": "cc1",
                "equals": "`t1`.`c1`",
                "null_rejecting": true
              }
            ]
          },
            "considered_execution_plans": [
              {
                "plan_prefix": [
                ],
                "table": "`t1`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "access_type": "ref", 第一张表不考虑用ref扫描
                      "index": "PRIMARY",
                      "usable": false,
                      "chosen": false
                    },
                    {
                      "rows_to_scan": 2, 用非best_ref方式扫描
                      "filtering_effect": [
                      ],
                      "final_filtering_effect": 1,
                      "access_type": "range",
                      "range_details": {
                        "used_index": "idx2"
                      },
                      "resulting_rows": 2,
                      "cost": 0.660457,
                      "chosen": true
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 2,
                "cost_for_plan": 0.660457,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`t1`"
                    ],
                    "table": "`t2`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1,  eq_ref扫描固定只有一行
                          "cost": 0.7,  计算公式=0.5(cur_read_cost) + 2(上一张表的行数) * 1(这张表行数) * 0.1
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "access_type": "scan", 因为t2没有mm tree因此只能选择全表扫描,又因为之前的索引扫描更优,所以这里不选择全表扫描
                          "chosen": false,
                          "cause": "covering_index_better_than_full_scan"
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 2,
                    "cost_for_plan": 1.36046,
                    "chosen": true
                  }
                ]
              },
              {
                "plan_prefix": [
                ],
                "table": "`t2`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "access_type": "ref", 第一张表不考虑用ref扫描
                      "index": "PRIMARY",
                      "usable": false,
                      "chosen": false
                    },
                    {
                      "rows_to_scan": 5,
                      "filtering_effect": [
                      ],
                      "final_filtering_effect": 1,
                      "access_type": "scan",
                      "resulting_rows": 5,
                      "cost": 0.75,
                      "chosen": true
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 5,
                "cost_for_plan": 0.75,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`t2`"
                    ],
                    "table": "`t1`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1, eq_ref扫描固定只有一行
                          "cost": 5.5, 计算公式=5(cur_read_cost) + 5(上一张表的行数) * 1(这张表行数) * 0.1
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "rows_to_scan": 2,
                          "filtering_effect": [
                          ],
                          "final_filtering_effect": 1,
                          "access_type": "range", 这里因为t1表有mm tree但是扫描方式不是ROWID_INTERSECTION所以还要继续用calculate_scan_cost来估计cost,然后跟上面用eq_ref方式扫描方式进行对比,看哪个更优
                          "range_details": {
                            "used_index": "idx2"
                          },
                          "resulting_rows": 2,
                          "cost": 3.30229, 因为这里的cost更低,因此舍弃上面的eq_ref扫描方式改为range扫描方式
                          "chosen": true
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 10,
                    "cost_for_plan": 4.05229,
                    "pruned_by_cost": true
                  }
                ]
              }
            ]
再看一个三张表连接的例子,可以看到eq_ref扫描方式也要估算条件过滤百分比,并且第一张表不用索引扫描,也就不需要执行find_best_ref函数。
greatsql> SELECT * FROM t1 JOIN t2 JOIN t3 ON t1.c1=t2.cc1 AND t3.ccc1=t2.cc1 AND t1.c2<15 AND t1.c2=10;
         {
            "considered_execution_plans": [
              {
                "plan_prefix": [
                ],
                "table": "`t1`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "access_type": "ref",
                      "index": "PRIMARY",
                      "usable": false,
                      "chosen": false
                    },
                    {
                      "access_type": "ref",
                      "index": "idx1",
                      "rows": 2,
                      "cost": 0.7,
                      "chosen": true
                    },
                    {
                      "access_type": "ref",
                      "index": "idx2",
                      "rows": 2,
                      "cost": 0.450457,
                      "chosen": true
                    },
                    {
                      "access_type": "range",
                      "range_details": {
                        "used_index": "idx2"
                      },
                      "chosen": false,
                      "cause": "heuristic_index_cheaper"
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 2,
                "cost_for_plan": 0.450457,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`t1`"
                    ],
                    "table": "`t2`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1,
                          "cost": 0.7,
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "access_type": "scan",
                          "chosen": false,
                          "cause": "covering_index_better_than_full_scan"
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 2,
                    "cost_for_plan": 1.15046,
                    "rest_of_plan": [
                      {
                        "plan_prefix": [
                          "`t1`",
                          "`t2`"
                        ],
                        "table": "`t3`",
                        "best_access_path": {
                          "considered_access_paths": [
                            {
                              "access_type": "ref",
                              "index": "idx3_1",
                              "rows": 1,
                              "cost": 0.7,
                              "chosen": true
                            },
                            {
                              "rows_to_scan": 5,
                              "filtering_effect": [
                              ],
                              "final_filtering_effect": 1,
                              "access_type": "scan",
                              "using_join_cache": true,
                              "buffers_needed": 1,
                              "resulting_rows": 5,
                              "cost": 1.25004,
                              "chosen": false
                            }
                          ]
                        },
                        "added_to_eq_ref_extension": true,
                        "condition_filtering_pct": 100,
                        "rows_for_plan": 2,
                        "cost_for_plan": 1.85046,
                        "chosen": true
                      }
                    ]
                  }
                ]
              },
              {
                "plan_prefix": [
                ],
                "table": "`t2`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "access_type": "ref",
                      "index": "PRIMARY",
                      "usable": false,
                      "chosen": false
                    },
                    {
                      "rows_to_scan": 5,
                      "filtering_effect": [
                      ],
                      "final_filtering_effect": 1,
                      "access_type": "scan",
                      "resulting_rows": 5,
                      "cost": 0.75,
                      "chosen": true
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 5,
                "cost_for_plan": 0.75,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`t2`"
                    ],
                    "table": "`t1`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1,
                          "cost": 1.75,
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "rows_to_scan": 2,
                          "filtering_effect": [
                          ],
                          "final_filtering_effect": 1,
                          "access_type": "range",
                          "range_details": {
                            "used_index": "idx2"
                          },
                          "resulting_rows": 2,
                          "cost": 3.30229,
                          "chosen": false
                        }
                      ]
                    },
                    "condition_filtering_pct": 28.5714, 这里看到eq_ref扫描方式也要估算条件过滤百分比
                    "rows_for_plan": 1.42857,
                    "cost_for_plan": 2.5,
                    "pruned_by_cost": true
                  },
                  {
                    "plan_prefix": [
                      "`t2`"
                    ],
                    "table": "`t3`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "ref",
                          "index": "idx3_1",
                          "rows": 1,
                          "cost": 1.75,
                          "chosen": true
                        },
                        {
                          "rows_to_scan": 5,
                          "filtering_effect": [
                          ],
                          "final_filtering_effect": 1,
                          "access_type": "scan",
                          "using_join_cache": true,
                          "buffers_needed": 1,
                          "resulting_rows": 5,
                          "cost": 2.75004,
                          "chosen": false
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 5,
                    "cost_for_plan": 2.5,
                    "pruned_by_cost": true
                  }
                ]
              },
              {
                "plan_prefix": [
                ],
                "table": "`t3`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "access_type": "ref",
                      "index": "idx3_1",
                      "usable": false,
                      "chosen": false
                    },
                    {
                      "rows_to_scan": 5,
                      "filtering_effect": [
                      ],
                      "final_filtering_effect": 1,
                      "access_type": "scan",
                      "resulting_rows": 5,
                      "cost": 0.75,
                      "chosen": true
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 5,
                "cost_for_plan": 0.75,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`t3`"
                    ],
                    "table": "`t1`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1,
                          "cost": 1.75,
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "rows_to_scan": 2,
                          "filtering_effect": [
                          ],
                          "final_filtering_effect": 1,
                          "access_type": "range",
                          "range_details": {
                            "used_index": "idx2"
                          },
                          "resulting_rows": 2,
                          "cost": 3.30229,
                          "chosen": false
                        }
                      ]
                    },
                    "condition_filtering_pct": 28.5714,
                    "rows_for_plan": 1.42857,
                    "cost_for_plan": 2.5,
                    "pruned_by_cost": true
                  },
                  {
                    "plan_prefix": [
                      "`t3`"
                    ],
                    "table": "`t2`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1,
                          "cost": 1.75,
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "access_type": "scan",
                          "chosen": false,
                          "cause": "covering_index_better_than_full_scan"
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 5,
                    "cost_for_plan": 2.5,
                    "pruned_by_cost": true
                  }
                ]
              }
            ]
          },
四、总结
从上面优化器的步骤我们认识了find_best_ref()进行rows和cost估计的过程,以及知道了使用这个结果的条件。需要注意的是,只有"ref_optimizer_key_uses"项目有值的情况才有可能使用这个结果。
Enjoy GreatSQL 😃
关于 GreatSQL
GreatSQL是适用于金融级应用的国内自主开源数据库,具备高性能、高可靠、高易用性、高安全等多个核心特性,可以作为MySQL或Percona Server的可选替换,用于线上生产环境,且完全免费并兼容MySQL或Percona Server。
相关链接: GreatSQL社区 Gitee GitHub Bilibili
GreatSQL社区:
社区博客有奖征稿详情:https://greatsql.cn/thread-100-1-1.html

技术交流群:
微信:扫码添加
GreatSQL社区助手微信好友,发送验证信息加群。

 
                    
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号