15
15
package expression
16
16
17
17
import (
18
+ "sync/atomic"
19
+
18
20
"github.com/pingcap/tidb/pkg/sessionctx"
19
21
"github.com/pingcap/tidb/pkg/util/chunk"
22
+ "github.com/pingcap/tidb/pkg/util/disjointset"
23
+ "github.com/pingcap/tidb/pkg/util/intest"
20
24
)
21
25
22
26
// columnEvaluator evaluates plain "Column" expressions by moving input columns
// into the output chunk according to an input-index -> output-indexes mapping.
type columnEvaluator struct {
23
27
// inputIdxToOutputIdxes maps an input column index to every output column
// index that is projected from it.
inputIdxToOutputIdxes map [int ][]int
28
+ // mergedInputIdxToOutputIdxes is inputIdxToOutputIdxes with the entries of aliased input columns merged.
// It can only be determined at runtime, once an input chunk has been seen, so it
// stays nil until the first call to run builds it. It is held in an atomic pointer
// because several workers may try to build and publish it at the same time.
29
+ mergedInputIdxToOutputIdxes atomic.Pointer [map [int ][]int ]
24
30
}
25
31
26
32
// run evaluates "Column" expressions.
27
33
// NOTE: It should be called after all the other expressions are evaluated
28
34
//
29
35
// since it will change the content of the input Chunk.
30
36
func (e * columnEvaluator ) run (ctx sessionctx.Context , input , output * chunk.Chunk ) error {
31
- for inputIdx , outputIdxes := range e .inputIdxToOutputIdxes {
37
+ // mergedInputIdxToOutputIdxes can only be determined at runtime, once the structure of the input chunk has been seen.
38
+ if e .mergedInputIdxToOutputIdxes .Load () == nil {
39
+ e .mergeInputIdxToOutputIdxes (input , e .inputIdxToOutputIdxes )
40
+ }
41
+ for inputIdx , outputIdxes := range * e .mergedInputIdxToOutputIdxes .Load () {
32
42
if err := output .SwapColumn (outputIdxes [0 ], input , inputIdx ); err != nil {
33
43
return err
34
44
}
@@ -39,6 +49,93 @@ func (e *columnEvaluator) run(ctx sessionctx.Context, input, output *chunk.Chunk
39
49
return nil
40
50
}
41
51
52
+ // mergeInputIdxToOutputIdxes merges separate inputIdxToOutputIdxes entries when column references
53
+ // are detected within the input chunk. This process ensures consistent handling of columns derived
54
+ // from the same original source.
55
+ //
56
+ // Consider the following scenario:
57
+ //
58
+ // Initial scan operation produces a column 'a':
59
+ //
60
+ // scan: a (addr: ???)
61
+ //
62
+ // This column 'a' is used in the first projection (proj1) to create two columns a1 and a2, both referencing 'a':
63
+ //
64
+ //                 proj1
65
+ //                /     \
66
+ //               /       \
67
+ //              /         \
68
+ //     a1 (addr: 0xe)    a2 (addr: 0xe)
69
+ //          /                  \
70
+ //         /                    \
71
+ //        /                      \
72
+ //     proj2                    proj2
73
+ //     /   \                    /   \
74
+ //    /     \                  /     \
75
+ //   a3      a4               a5      a6
76
+ //
77
+ // (addr: 0xe) (addr: 0xe) (addr: 0xe) (addr: 0xe)
78
+ //
79
+ // Here, a1 and a2 share the same address (0xe), indicating they reference the same data from the original 'a'.
80
+ //
81
+ // When moving to the second projection (proj2), the system tries to project these columns further:
82
+ // - The first set (left side) consists of a3 and a4, derived from a1, both retaining the address (0xe).
83
+ // - The second set (right side) consists of a5 and a6, derived from a2, also starting with address (0xe).
84
+ //
85
+ // When proj1 is complete, the output chunk contains two columns [a1, a2], both derived from the single column 'a' from the scan.
86
+ // Since both a1 and a2 are column references with the same address (0xe), they are treated as referencing the same data.
87
+ //
88
+ // In proj2, two separate <inputIdx, []outputIdxes> items are created:
89
+ // - <0, [0,1]>: This means the 0th input column (a1) is projected twice, into the 0th and 1st columns of the output chunk.
90
+ // - <1, [2,3]>: This means the 1st input column (a2) is projected twice, into the 2nd and 3rd columns of the output chunk.
91
+ //
92
+ // Due to the column swapping logic in each projection, after applying the <0, [0,1]> projection,
93
+ // the addresses for a1 and a2 may become swapped or invalid:
94
+ //
95
+ // proj1: a1 (addr: invalid) a2 (addr: invalid)
96
+ //
97
+ // This can lead to issues in proj2, where further operations on these columns may be unsafe:
98
+ //
99
+ // proj2: a3 (addr: 0xe) a4 (addr: 0xe) a5 (addr: ???) a6 (addr: ???)
100
+ //
101
+ // Therefore, it's crucial to identify and merge the original column references early, ensuring
102
+ // the final inputIdxToOutputIdxes mapping accurately reflects the shared origins of the data.
103
+ // For instance, <0, [0,1,2,3]> indicates that the 0th input column (original 'a') is referenced
104
+ // by all four output columns in the final output.
105
+ //
106
+ // mergeInputIdxToOutputIdxes merges inputIdxToOutputIdxes based on detected column references.
107
+ // This ensures that columns with the same reference are correctly handled in the output chunk.
108
+ func (e * columnEvaluator ) mergeInputIdxToOutputIdxes (input * chunk.Chunk , inputIdxToOutputIdxes map [int ][]int ) {
// Overview: group the input column indexes whose columns are the same object,
// fold the output indexes of every group into a single entry, and publish the
// resulting map on e exactly once.
//
// originalDJSet holds the groups of aliased input column indexes.
// NOTE(review): the argument 4 is presumably an initial capacity hint — confirm against disjointset.NewSet.
109
+ originalDJSet := disjointset.NewSet [int ](4 )
// flag[j] is set once column j has been found to alias an earlier column, so j
// is not used again as the starting point of a comparison pass.
110
+ flag := make ([]bool , input .NumCols ())
111
+ // Detect self column-references inside the input chunk by comparing column addresses.
112
+ for i := 0 ; i < input .NumCols (); i ++ {
113
+ if flag [i ] {
114
+ continue
115
+ }
116
+ for j := i + 1 ; j < input .NumCols (); j ++ {
117
+ if input .Column (i ) == input .Column (j ) {
118
+ flag [j ] = true
119
+ originalDJSet .Union (i , j )
120
+ }
121
+ }
122
+ }
123
+ // Merge inputIdxToOutputIdxes based on the detected column references.
// Map iteration order is unspecified in Go, so the order in which the output
// indexes of one group are appended is not fixed between calls.
124
+ newInputIdxToOutputIdxes := make (map [int ][]int , len (inputIdxToOutputIdxes ))
125
+ for inputIdx := range inputIdxToOutputIdxes {
126
+ // FindRoot returns the set's internal offset, not a column index; FindVal is used to map it back to the column index.
// NOTE(review): inputIdx may never have been passed to Union above; this relies on
// FindRoot/FindVal handling such values — confirm against the disjointset package.
127
+ originalRootIdx := originalDJSet .FindRoot (inputIdx )
128
+ originalVal , ok := originalDJSet .FindVal (originalRootIdx )
129
+ intest .Assert (ok )
// Append this entry's output indexes to the entry keyed by its group's representative column.
130
+ mergedOutputIdxes := newInputIdxToOutputIdxes [originalVal ]
131
+ mergedOutputIdxes = append (mergedOutputIdxes , inputIdxToOutputIdxes [inputIdx ]... )
132
+ newInputIdxToOutputIdxes [originalVal ] = mergedOutputIdxes
133
+ }
134
+ // Publish the merged inputIdxToOutputIdxes atomically, only if nothing has been published yet.
135
+ // If the compare-and-swap fails, another worker has already published its result in the meantime and this one is dropped.
136
+ e .mergedInputIdxToOutputIdxes .CompareAndSwap (nil , & newInputIdxToOutputIdxes )
137
+ }
138
+
42
139
type defaultEvaluator struct {
43
140
outputIdxes []int
44
141
exprs []Expression
0 commit comments