Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 84 additions & 23 deletions pkg/executor/slow_query.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"io"
"os"
"path/filepath"
"regexp"
"runtime"
"slices"
"strconv"
Expand Down Expand Up @@ -523,32 +522,94 @@ func getLineIndex(offset offset, index int) int {
return fileLine
}

// kvSplitRegex: it was just for split "field: value field: value..."
var kvSplitRegex = regexp.MustCompile(`\w+: `)
// findMatchedRightBracket returns the index of the right bracket that closes
// the left bracket located at line[leftBracketIdx].
//
// Only brackets of the same kind as the opening one take part in the nesting
// count: while matching "{}" any '[' or ']' is treated as ordinary content, and
// vice versa.
//
// It returns -1 if leftBracketIdx is not a valid index into line, if
// line[leftBracketIdx] is neither '{' nor '[', or if no matching right bracket
// exists in the rest of line.
func findMatchedRightBracket(line string, leftBracketIdx int) int {
	// Guard the index so that an invalid position reports -1 instead of panicking.
	if leftBracketIdx < 0 || leftBracketIdx >= len(line) {
		return -1
	}
	leftBracket := line[leftBracketIdx]
	var rightBracket byte
	switch leftBracket {
	case '{':
		rightBracket = '}'
	case '[':
		rightBracket = ']'
	default:
		return -1
	}
	// depth is the number of currently open brackets. The scan starts on the
	// opening bracket itself, so depth stays >= 1 until the match is reached.
	depth := 0
	for i := leftBracketIdx; i < len(line); i++ {
		switch line[i] {
		case leftBracket:
			depth++
		case rightBracket:
			depth--
			if depth == 0 {
				return i
			}
		}
	}
	return -1
}

func isLetterOrNumeric(b byte) bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can put this function in pkg/util/stringutil file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just leave it in the local file, since it may be changed for slow log parsing only.

return ('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') || ('0' <= b && b <= '9')
}

// splitByColon split a line like "field: value field: value..."
// Note:
// 1. Both field and value string contain only ANSI characters
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is too restricted. E.g. tidb_redact_log may output non-ansi characters later sometime.
I am thinking if we could just use strings.Index(a, b) to mimic the original regexp.

Copy link
Contributor Author

@yibin87 yibin87 Jul 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, this rule can be removed for current algorithm, I'll change it.
Simply using strings.Index(a, b) can't handle the cases where ": " appears inside "{}".
BTW, splitByColon is not the root entry for slow log parsing; for logs that may contain non-ANSI characters, we can handle them separately if we do need to.

Copy link
Contributor Author

@yibin87 yibin87 Jul 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checked that an ASCII byte can never be part of a valid multi-byte UTF-8 character, since every byte of a multi-byte character has its high bit set (lead bytes start with 110/1110/11110, continuation bytes with 10). Thus both field and value strings can contain non-ASCII characters. Besides, using []rune would hurt performance a lot. So I'll keep the previous algorithm and just update the restriction comments:
For field string, first character should be ascii letters or digits. For value string, whitespace can only be contained inside "{}"/"[]".

// 2. value string may be surrounded by brackets, allowed brackets includes "[]" and "{}", like {key: value,{key: value}}
// "[]" can only be nested inside "[]"; "{}" can only be nested inside "{}"
// 3. value string can't contain ' ' character unless it is inside brackets
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the original implementation also have these restrictions?

Copy link
Contributor Author

@yibin87 yibin87 Jul 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. However, the current slow log satisfies these restrictions, and its format is unlikely to change frequently in the future. The previous implementation provided broader functionality.

func splitByColon(line string) (fields []string, values []string) {
matches := kvSplitRegex.FindAllStringIndex(line, -1)
fields = make([]string, 0, len(matches))
values = make([]string, 0, len(matches))

beg := 0
end := 0
for _, match := range matches {
// trim ": "
fields = append(fields, line[match[0]:match[1]-2])

end = match[0]
if beg != 0 {
// trim " "
values = append(values, line[beg:end-1])
}
beg = match[1]
fields = make([]string, 0, 1)
values = make([]string, 0, 1)

parseKey := true
start := 0
errMsg := ""
for current := 0; current < len(line); {
if parseKey {
// Find key start
for current < len(line) && !isLetterOrNumeric(line[current]) {
current++
}
start = current
if current >= len(line) {
break
}
for current < len(line) && line[current] != ':' {
current++
}
fields = append(fields, line[start:current])
parseKey = false
current += 2 // bypass ": "
} else {
start = current
if current < len(line) && (line[current] == '{' || line[current] == '[') {
rBraceIdx := findMatchedRightBracket(line, current)
if rBraceIdx == -1 {
errMsg = "Unmatched left brace"
break
}
current = rBraceIdx + 1
} else {
for current < len(line) && line[current] != ' ' {
current++
}
}
values = append(values, line[start:min(current, len(line))])
parseKey = true
}
}

if end != len(line) {
// " " does not exist in the end
values = append(values, line[beg:])
if len(errMsg) > 0 {
logutil.BgLogger().Warn("slow query parse slow log error", zap.String("Error", errMsg), zap.String("Log", line))
return nil, nil
}
return fields, values
}
Expand Down
22 changes: 21 additions & 1 deletion pkg/executor/slow_query_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -532,8 +532,8 @@ func TestSplitbyColon(t *testing.T) {
},
{
"123a",
[]string{},
[]string{"123a"},
[]string{},
},
{
"1a: 2b",
Expand All @@ -550,6 +550,26 @@ func TestSplitbyColon(t *testing.T) {
[]string{"1a", "4d"},
[]string{"[2b,3c]", "5e"},
},
{
"1a: [2b,[3c: 3cc]] 4d: 5e",
[]string{"1a", "4d"},
[]string{"[2b,[3c: 3cc]]", "5e"},
},
{
"1a: {2b 3c} 4d: 5e",
[]string{"1a", "4d"},
[]string{"{2b 3c}", "5e"},
},
{
"1a: {2b,3c} 4d: 5e",
[]string{"1a", "4d"},
[]string{"{2b,3c}", "5e"},
},
{
"1a: {2b,{3c: 3cc}} 4d: 5e",
[]string{"1a", "4d"},
[]string{"{2b,{3c: 3cc}}", "5e"},
},
{

"Time: 2021-09-08T14:39:54.506967433+08:00",
Expand Down