Skip to content

Commit 81370d4

Browse files
authored
planner: add skew risk ratio for range pred (pingcap#62035)
close pingcap#62093
1 parent 0cd280b commit 81370d4

File tree

10 files changed

+102
-15
lines changed

10 files changed

+102
-15
lines changed

pkg/planner/cardinality/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ go_test(
6060
data = glob(["testdata/**"]),
6161
embed = [":cardinality"],
6262
flaky = True,
63-
shard_count = 33,
63+
shard_count = 34,
6464
deps = [
6565
"//pkg/config",
6666
"//pkg/domain",

pkg/planner/cardinality/row_count_index.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ func matchPrefix(row chunk.Row, colIdx int, ad *types.Datum) bool {
600600
}
601601

602602
// betweenRowCountOnIndex estimates the row count for interval [l, r).
603-
// The input sctx is just for debug trace, you can pass nil safely if that's not needed.
603+
// The input sctx is required for stats version 2. For version 1, it is only used for debug tracing, so you can safely pass nil.
604604
func betweenRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, l, r types.Datum) float64 {
605605
histBetweenCnt := idx.Histogram.BetweenRowCount(sctx, l, r)
606606
if idx.StatsVer == statistics.Version1 {

pkg/planner/cardinality/selectivity_test.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1578,3 +1578,56 @@ func TestRiskEqSkewRatio(t *testing.T) {
15781578
// Reset global variable to default.
15791579
testKit.MustExec("set @@global.tidb_opt_risk_eq_skew_ratio = default")
15801580
}
1581+
1582+
func TestRiskRangeSkewRatio(t *testing.T) {
1583+
store, dom := testkit.CreateMockStoreAndDomain(t)
1584+
testKit := testkit.NewTestKit(t, store)
1585+
1586+
testKit.MustExec("use test")
1587+
testKit.MustExec("drop table if exists t")
1588+
testKit.MustExec("create table t(a int, index idx(a))")
1589+
is := dom.InfoSchema()
1590+
tb, err := is.TableByName(context.Background(), ast.NewCIStr("test"), ast.NewCIStr("t"))
1591+
require.NoError(t, err)
1592+
tblInfo := tb.Meta()
1593+
1594+
// Insert enough rows to produce a skewed distribution.
1595+
testKit.MustExec("insert into t values (1), (1), (1), (1), (2), (2), (3), (4), (5), (5)")
1596+
// Do not collect topn and only collect 1 bucket to ensure later queries will be within a bucket.
1597+
testKit.MustExec(`analyze table t with 0 topn, 1 buckets`)
1598+
h := dom.StatsHandle()
1599+
require.NoError(t, h.DumpStatsDeltaToKV(true))
1600+
1601+
sctx := testKit.Session()
1602+
idxID := tblInfo.Indices[0].ID
1603+
statsTbl := h.GetTableStats(tb.Meta())
1604+
// Search for the range from 2 to 3, since there is only one bucket it will be a query within
1605+
// a bucket.
1606+
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = 0")
1607+
count, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3))
1608+
require.NoError(t, err)
1609+
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = 0.5")
1610+
count2, _, err2 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3))
1611+
require.NoError(t, err2)
1612+
// Result of count2 should be larger than count because the risk ratio is higher
1613+
require.Less(t, count, count2)
1614+
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = 1")
1615+
count3, _, err3 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3))
1616+
require.NoError(t, err3)
1617+
// Result of count3 should be larger because the risk ratio is higher
1618+
require.Less(t, count2, count3)
1619+
// Repeat the prior test by setting the global variable instead of the session variable. This should have no effect on the current session, whose ratio is still 1, so the estimate should remain larger than count2.
1620+
testKit.MustExec("set @@global.tidb_opt_risk_range_skew_ratio = 0.5")
1621+
count4, _, err4 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3))
1622+
require.NoError(t, err4)
1623+
require.Less(t, count2, count4)
1624+
// Repeat the prior test by setting the session variable to the default. Count4 should inherit the global
1625+
// variable and be less than count3.
1626+
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = default")
1627+
count4, _, err4 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3))
1628+
require.NoError(t, err4)
1629+
require.Less(t, count4, count3)
1630+
// Reset global variable to default.
1631+
testKit.MustExec("set @@global.tidb_opt_risk_range_skew_ratio = default")
1632+
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = default")
1633+
}

pkg/sessionctx/vardef/tidb_vars.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,9 @@ const (
326326
// TiDBOptRiskEqSkewRatio controls the amount of skew that is applied to equal predicate estimation when a value is not found in TopN/buckets.
327327
TiDBOptRiskEqSkewRatio = "tidb_opt_risk_eq_skew_ratio"
328328

329+
// TiDBOptRiskRangeSkewRatio controls the amount of skew that is applied to range predicate estimation when a range falls within a bucket.
330+
TiDBOptRiskRangeSkewRatio = "tidb_opt_risk_range_skew_ratio"
331+
329332
// TiDBOptCPUFactor is the CPU cost of processing one expression for one row.
330333
TiDBOptCPUFactor = "tidb_opt_cpu_factor"
331334
// TiDBOptCopCPUFactor is the CPU cost of processing one expression for one row in coprocessor.
@@ -1327,6 +1330,7 @@ const (
13271330
DefOptCorrelationThreshold = 0.9
13281331
DefOptCorrelationExpFactor = 1
13291332
DefOptRiskEqSkewRatio = 0.0
1333+
DefOptRiskRangeSkewRatio = 0.0
13301334
DefOptCPUFactor = 3.0
13311335
DefOptCopCPUFactor = 3.0
13321336
DefOptTiFlashConcurrencyFactor = 24.0

pkg/sessionctx/variable/session.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,9 @@ type SessionVars struct {
10211021
// RiskEqSkewRatio is used to control the ratio of skew that is applied to equal predicates not found in TopN/buckets.
10221022
RiskEqSkewRatio float64
10231023

1024+
// RiskRangeSkewRatio is used to control the ratio of skew that is applied to range predicates that fall within a single bucket.
1025+
RiskRangeSkewRatio float64
1026+
10241027
// cpuFactor is the CPU cost of processing one expression for one row.
10251028
cpuFactor float64
10261029
// copCPUFactor is the CPU cost of processing one expression for one row in coprocessor.
@@ -2196,6 +2199,7 @@ func NewSessionVars(hctx HookContext) *SessionVars {
21962199
CorrelationThreshold: vardef.DefOptCorrelationThreshold,
21972200
CorrelationExpFactor: vardef.DefOptCorrelationExpFactor,
21982201
RiskEqSkewRatio: vardef.DefOptRiskEqSkewRatio,
2202+
RiskRangeSkewRatio: vardef.DefOptRiskRangeSkewRatio,
21992203
cpuFactor: vardef.DefOptCPUFactor,
22002204
copCPUFactor: vardef.DefOptCopCPUFactor,
22012205
CopTiFlashConcurrencyFactor: vardef.DefOptTiFlashConcurrencyFactor,

pkg/sessionctx/variable/setvar_affect.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ var isHintUpdatableVerified = map[string]struct{}{
7272
"tidb_opt_table_tiflash_scan_cost_factor": {},
7373
"tidb_opt_topn_cost_factor": {},
7474
"tidb_opt_skew_ratio": {},
75+
"tidb_opt_range_ratio": {},
7576
"tidb_index_join_batch_size": {},
7677
"tidb_index_lookup_size": {},
7778
"tidb_index_serial_scan_concurrency": {},

pkg/sessionctx/variable/sysvar.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2034,6 +2034,10 @@ var defaultSysVars = []*SysVar{
20342034
s.RiskEqSkewRatio = tidbOptFloat64(val, vardef.DefOptRiskEqSkewRatio)
20352035
return nil
20362036
}},
2037+
{Scope: vardef.ScopeGlobal | vardef.ScopeSession, Name: vardef.TiDBOptRiskRangeSkewRatio, Value: strconv.FormatFloat(vardef.DefOptRiskRangeSkewRatio, 'f', -1, 64), Type: vardef.TypeFloat, MinValue: 0, MaxValue: 1, SetSession: func(s *SessionVars, val string) error {
2038+
s.RiskRangeSkewRatio = tidbOptFloat64(val, vardef.DefOptRiskRangeSkewRatio)
2039+
return nil
2040+
}},
20372041
{Scope: vardef.ScopeGlobal | vardef.ScopeSession, Name: vardef.TiDBOptCPUFactor, Value: strconv.FormatFloat(vardef.DefOptCPUFactor, 'f', -1, 64), Type: vardef.TypeFloat, MinValue: 0, MaxValue: math.MaxUint64, SetSession: func(s *SessionVars, val string) error {
20382042
s.cpuFactor = tidbOptFloat64(val, vardef.DefOptCPUFactor)
20392043
return nil

pkg/statistics/handle/globalstats/global_stats_internal_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,8 @@ func testGlobalStatsAndSQLBinding(tk *testkit.TestKit) {
405405
tk.MustExec("use test_global_stats")
406406
tk.MustExec("set @@tidb_partition_prune_mode = 'dynamic'")
407407
tk.MustExec("set tidb_cost_model_version=2")
408+
// Disable auto analyze to ensure that stats are not automatically collected
409+
tk.MustExec("set @@global.tidb_enable_auto_analyze='OFF'")
408410

409411
// hash and range and list partition
410412
tk.MustExec("create table thash(a int, b int, key(a)) partition by hash(a) partitions 4")
@@ -439,8 +441,6 @@ func testGlobalStatsAndSQLBinding(tk *testkit.TestKit) {
439441
tk.MustExec("insert into trange values " + strings.Join(vals, ","))
440442
tk.MustExec("insert into tlist values " + strings.Join(listVals, ","))
441443

442-
// Disable auto analyze to ensure that stats are not automatically collected
443-
tk.MustExec("set @@global.tidb_enable_auto_analyze='OFF'")
444444
// before analyzing, the planner will choose TableScan to access the 1% of records
445445
tk.MustHavePlan("select * from thash where a<100", "TableFullScan")
446446
tk.MustHavePlan("select * from trange where a<100", "TableFullScan")

pkg/statistics/histogram.go

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -561,19 +561,40 @@ func (hg *Histogram) LessRowCount(sctx planctx.PlanContext, value types.Datum) f
561561
}
562562

563563
// BetweenRowCount estimates the row count where column greater or equal to a and less than b.
564-
// The input sctx is just for debug trace, you can pass nil safely if that's not needed.
564+
// The input sctx is required for stats version 2. For version 1, it is just for debug trace, you can pass nil safely.
565565
func (hg *Histogram) BetweenRowCount(sctx planctx.PlanContext, a, b types.Datum) float64 {
566-
lessCountA := hg.LessRowCount(sctx, a)
567-
lessCountB := hg.LessRowCount(sctx, b)
566+
lessCountA, bktIndexA := hg.LessRowCountWithBktIdx(sctx, a)
567+
lessCountB, bktIndexB := hg.LessRowCountWithBktIdx(sctx, b)
568568
rangeEst := lessCountB - lessCountA
569569
lowEqual, _ := hg.EqualRowCount(sctx, a, false)
570570
ndvAvg := hg.NotNullCount() / float64(hg.NDV)
571571
// If values fall in the same bucket, we may underestimate the fractional result. So estimate the low value (a) as an equals, and
572572
// estimate the high value as the default (because the input high value may be "larger" than the true high value). The range should
573573
// not be less than both the low+high - or the lesser of the estimate for the individual range of a or b is used as a bound.
574-
if rangeEst < math.Max(lowEqual, ndvAvg) && hg.NDV > 0 {
575-
result := math.Min(lessCountB, hg.NotNullCount()-lessCountA)
576-
return math.Min(result, lowEqual+ndvAvg)
574+
if rangeEst < max(lowEqual, ndvAvg) && hg.NDV > 0 {
575+
result := min(lessCountB, hg.NotNullCount()-lessCountA)
576+
rangeEst = min(result, lowEqual+ndvAvg)
577+
}
578+
// If both values fall in the same bucket, use skewRatio to adjust the range estimate to account for potential skew.
579+
if len(hg.Buckets) != 0 && bktIndexA == bktIndexB {
580+
// sctx may be nil for stats version 1
581+
if sctx != nil {
582+
skewRatio := sctx.GetSessionVars().RiskRangeSkewRatio
583+
sctx.GetSessionVars().RecordRelevantOptVar(vardef.TiDBOptRiskRangeSkewRatio)
584+
if skewRatio > 0 {
585+
// Worst case skew is if the range includes all the rows in the bucket
586+
skewEstimate := hg.Buckets[bktIndexA].Count
587+
if bktIndexA > 0 {
588+
skewEstimate -= hg.Buckets[bktIndexA-1].Count
589+
}
590+
// If range does not include last value of its bucket, remove the repeat count from the skew estimate.
591+
if lessCountB <= float64(hg.Buckets[bktIndexA].Count-hg.Buckets[bktIndexA].Repeat) {
592+
skewEstimate -= hg.Buckets[bktIndexA].Repeat
593+
}
594+
// Add a scaled ratio of the worst case skewed estimate to our regular estimate
595+
return rangeEst + max(0, (float64(skewEstimate)-rangeEst)*skewRatio)
596+
}
597+
}
577598
}
578599
return rangeEst
579600
}

pkg/statistics/statistics_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ func SubTestBuild() func(*testing.T) {
521521
require.Equal(t, 0.0, count)
522522
count, _ = col.EqualRowCount(nil, types.NewIntDatum(200000000), false)
523523
require.Equal(t, 0.0, count)
524-
count = col.BetweenRowCount(nil, types.NewIntDatum(3000), types.NewIntDatum(3500))
524+
count = col.BetweenRowCount(ctx, types.NewIntDatum(3000), types.NewIntDatum(3500))
525525
require.Equal(t, 4994, int(count))
526526
count = col.LessRowCount(nil, types.NewIntDatum(1))
527527
require.Equal(t, 5, int(count))
@@ -547,7 +547,7 @@ func SubTestBuild() func(*testing.T) {
547547
require.Equal(t, 90010, int(count))
548548
count = colv2.GreaterRowCount(types.NewIntDatum(200000000))
549549
require.Equal(t, 0.0, count)
550-
count = colv2.BetweenRowCount(nil, types.NewIntDatum(3000), types.NewIntDatum(3500))
550+
count = colv2.BetweenRowCount(ctx, types.NewIntDatum(3000), types.NewIntDatum(3500))
551551
require.Equal(t, 5001, int(count))
552552
count = colv2.LessRowCount(nil, types.NewIntDatum(1))
553553
require.Equal(t, 0, int(count))
@@ -579,9 +579,9 @@ func SubTestBuild() func(*testing.T) {
579579
require.Equal(t, 1, int(count))
580580
count = col.LessRowCount(nil, encodeKey(types.NewIntDatum(20000)))
581581
require.Equal(t, 19999, int(count))
582-
count = col.BetweenRowCount(nil, encodeKey(types.NewIntDatum(30000)), encodeKey(types.NewIntDatum(35000)))
582+
count = col.BetweenRowCount(ctx, encodeKey(types.NewIntDatum(30000)), encodeKey(types.NewIntDatum(35000)))
583583
require.Equal(t, 4999, int(count))
584-
count = col.BetweenRowCount(nil, encodeKey(types.MinNotNullDatum()), encodeKey(types.NewIntDatum(0)))
584+
count = col.BetweenRowCount(ctx, encodeKey(types.MinNotNullDatum()), encodeKey(types.NewIntDatum(0)))
585585
require.Equal(t, 0, int(count))
586586
count = col.LessRowCount(nil, encodeKey(types.NewIntDatum(0)))
587587
require.Equal(t, 0, int(count))
@@ -596,7 +596,7 @@ func SubTestBuild() func(*testing.T) {
596596
require.Equal(t, 1, int(count))
597597
count = col.LessRowCount(nil, types.NewIntDatum(20000))
598598
require.Equal(t, 20000, int(count))
599-
count = col.BetweenRowCount(nil, types.NewIntDatum(30000), types.NewIntDatum(35000))
599+
count = col.BetweenRowCount(ctx, types.NewIntDatum(30000), types.NewIntDatum(35000))
600600
require.Equal(t, 5000, int(count))
601601
count = col.GreaterRowCount(types.NewIntDatum(1001))
602602
require.Equal(t, 98998, int(count))

0 commit comments

Comments
 (0)