Skip to content

Commit 6a3361c

Browse files
xzj7019 authored and dataroaring committed
[fix](nereids) refine row count estimation for mark join (#38270)
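The current semi/anti join stats estimation doesn't consider the mark join case. For a mark join, the estimated row count should simply follow the corresponding input side's row count without change (selectivity 1.0), e.g. the left side's row count for a left semi/anti join.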
1 parent 488e877 commit 6a3361c

File tree

12 files changed

+19
-19
lines changed

12 files changed

+19
-19
lines changed

fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,8 +267,8 @@ private static double estimateSemiOrAntiRowCountBySlotsEqual(Statistics leftStat
267267
}
268268

269269
private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) {
270-
if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) {
271-
double sel = computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
270+
if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join) || join.isMarkJoin()) {
271+
double sel = join.isMarkJoin() ? 1.0 : computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
272272
if (join.getJoinType().isLeftSemiOrAntiJoin()) {
273273
return new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel)
274274
.putColumnStatistics(leftStats.columnStatistics())

regression-test/data/nereids_hint_tpcds_p0/shape/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
1515
------------------------PhysicalProject

regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
1515
------------------------PhysicalProject

regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
1515
------------------------PhysicalProject

regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ PhysicalResultSink
1919
--------------------------------PhysicalDistribute[DistributionSpecHash]
2020
----------------------------------hashAgg[LOCAL]
2121
------------------------------------PhysicalProject
22-
--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
22+
--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk]
2323
----------------------------------------PhysicalProject
24-
------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
24+
------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF1
2525
----------------------------------------PhysicalProject
2626
------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212))
2727
--------------------------------------------PhysicalOlapScan[date_dim]
@@ -34,9 +34,9 @@ PhysicalResultSink
3434
--------------------------------PhysicalDistribute[DistributionSpecHash]
3535
----------------------------------hashAgg[LOCAL]
3636
------------------------------------PhysicalProject
37-
--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk]
37+
--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk]
3838
----------------------------------------PhysicalProject
39-
------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF0
39+
------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0
4040
----------------------------------------PhysicalProject
4141
------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212))
4242
--------------------------------------------PhysicalOlapScan[date_dim]

regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=()
1515
------------------------PhysicalProject

regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
1515
------------------------PhysicalProject

regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=()
1515
------------------------PhysicalProject

regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
1515
------------------------PhysicalProject

regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ PhysicalResultSink
99
------------hashAgg[LOCAL]
1010
--------------PhysicalProject
1111
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
12-
------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
12+
------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk]
1313
--------------------PhysicalProject
1414
----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
1515
------------------------PhysicalProject

0 commit comments

Comments
 (0)