
Commit 6a33993

config: must set line terminator when use strict-format (#53444) (#54939)
close #37338
1 parent b416986 commit 6a33993
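
Background, reconstructed from the test added below (TestSplitLargeFileSeekInsideCRLF): with strict-format enabled but no mydumper.csv.terminator configured, Lightning splits a large CSV at raw byte offsets, so a chunk boundary can fall between the '\r' and '\n' of a CRLF row ending and chunk offsets stop matching row boundaries. A standalone Go sketch of that failure mode (illustrative byte arithmetic only, not Lightning's splitting code):

    package main

    import "fmt"

    // Standalone sketch (not Lightning code) of the bug behind #37338:
    // splitting CRLF data at fixed byte offsets can cut between '\r'
    // and '\n', so chunk boundaries stop matching row boundaries.
    func main() {
        data := "1\r\n2\r\n3\r\n4\r\n"
        const maxRegionSize = 2 // stand-in for mydumper.max-region-size
        for start := 0; start < len(data); start += maxRegionSize {
            end := start + maxRegionSize
            if end > len(data) {
                end = len(data)
            }
            // e.g. chunk [0,2) = "1\r" and chunk [2,4) = "\n2": the CRLF
            // pair is torn across two chunks.
            fmt.Printf("chunk [%2d,%2d): %q\n", start, end, data[start:end])
        }
    }

With an explicit terminator, the splitter can realign each tentative offset to the next row boundary instead; that is what this commit enforces in both Lightning's config and IMPORT INTO's options.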

12 files changed: +159 −6 lines

lightning/pkg/importer/chunk_process.go

Lines changed: 1 addition & 1 deletion

@@ -724,7 +724,7 @@ func (cr *chunkProcessor) deliverLoop(
                 rc.status.FinishedFileSize.Add(delta)
             }
         } else {
-            deliverLogger.Warn("offset go back", zap.Int64("curr", highOffset),
+            deliverLogger.Error("offset go back", zap.Int64("curr", highOffset),
                 zap.Int64("start", lowOffset))
         }
     }
Lines changed: 3 additions & 0 deletions

@@ -1,3 +1,6 @@
 [mydumper]
 strict-format = true
 max-region-size = 200
+
+[mydumper.csv]
+terminator = "\n"

pkg/executor/importer/import.go

Lines changed: 4 additions & 0 deletions

@@ -758,6 +758,10 @@ func (p *Plan) initOptions(ctx context.Context, seCtx sessionctx.Context, option
         return exeerrors.ErrInvalidOptionVal.FastGenByArgs("skip_rows, should be <= 1 when split-file is enabled")
     }

+    if p.SplitFile && len(p.LinesTerminatedBy) == 0 {
+        return exeerrors.ErrInvalidOptionVal.FastGenByArgs("lines_terminated_by, should not be empty when use split_file")
+    }
+
     p.adjustOptions(targetNodeCPUCnt)
     return nil
 }
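
IMPORT INTO's split_file option carves the input at byte offsets in the same way, so the new guard demands an explicit terminator (e.g. `with split_file, lines_terminated_by='\n'`); the updated import_into.result below shows the rejected form. A standalone restatement of the guard with plain types — simplified names, not the real Plan API:

    package main

    import (
        "errors"
        "fmt"
    )

    // validateSplitFile restates the new initOptions guard; the real code
    // returns exeerrors.ErrInvalidOptionVal from a method on Plan.
    func validateSplitFile(splitFile bool, linesTerminatedBy string) error {
        if splitFile && len(linesTerminatedBy) == 0 {
            return errors.New("invalid option value for lines_terminated_by, should not be empty when use split_file")
        }
        return nil
    }

    func main() {
        fmt.Println(validateSplitFile(true, ""))   // rejected
        fmt.Println(validateSplitFile(true, "\n")) // <nil>
    }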

pkg/lightning/config/BUILD.bazel

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ go_test(
     ],
     embed = [":config"],
     flaky = True,
-    shard_count = 48,
+    shard_count = 49,
     deps = [
         "@com_github_burntsushi_toml//:toml",
         "@com_github_stretchr_testify//require",

pkg/lightning/config/config.go

Lines changed: 5 additions & 0 deletions

@@ -884,6 +884,11 @@ func (m *MydumperRuntime) adjust() error {
     if err := m.CSV.adjust(); err != nil {
         return err
     }
+    if m.StrictFormat && len(m.CSV.Terminator) == 0 {
+        return common.ErrInvalidConfig.GenWithStack(
+            `mydumper.strict-format can not be used with empty mydumper.csv.terminator. Please set mydumper.csv.terminator to a non-empty value like "\r\n"`)
+    }
     for _, rule := range m.FileRouters {
         if filepath.IsAbs(rule.Path) {
             relPath, err := filepath.Rel(m.SourceDir, rule.Path)

pkg/lightning/config/config_test.go

Lines changed: 24 additions & 0 deletions

@@ -81,6 +81,30 @@ func TestAdjustPdAddrAndPort(t *testing.T) {
     require.Equal(t, "123.45.67.89:1234,56.78.90.12:3456", cfg.TiDB.PdAddr)
 }

+func TestStrictFormat(t *testing.T) {
+    ts, host, port := startMockServer(t, http.StatusOK,
+        `{"port":4444,"advertise-address":"","path":"123.45.67.89:1234,56.78.90.12:3456"}`,
+    )
+    defer ts.Close()
+
+    cfg := NewConfig()
+    cfg.TiDB.Host = host
+    cfg.TiDB.StatusPort = port
+    cfg.Mydumper.SourceDir = "."
+    cfg.TikvImporter.Backend = BackendLocal
+    cfg.TikvImporter.SortedKVDir = "."
+    cfg.TiDB.DistSQLScanConcurrency = 1
+    cfg.Mydumper.StrictFormat = true
+
+    err := cfg.Adjust(context.Background())
+    require.ErrorContains(t, err, "mydumper.strict-format can not be used with empty mydumper.csv.terminator")
+    t.Log(err.Error())
+
+    cfg.Mydumper.CSV.Terminator = "\r\n"
+    err = cfg.Adjust(context.Background())
+    require.NoError(t, err)
+}
+
 func TestPausePDSchedulerScope(t *testing.T) {
     ts, host, port := startMockServer(t, http.StatusOK,
         `{"port":4444,"advertise-address":"","path":"123.45.67.89:1234,56.78.90.12:3456"}`,

pkg/lightning/mydump/csv_parser.go

Lines changed: 2 additions & 1 deletion

@@ -97,7 +97,8 @@ type field struct {
     quoted bool
 }

-// NewCSVParser creates a CSV parser.
+// NewCSVParser creates a CSV parser. The ownership of the reader is transferred
+// to the parser.
 func NewCSVParser(
     ctx context.Context,
     cfg *config.CSVConfig,
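
The expanded comment pins down the cleanup contract that the region.go changes below rely on: before the reader is handed to NewCSVParser the caller must close it on error paths, and afterwards closing the parser releases the reader. A minimal sketch of that convention with stand-in types (not the mydump API):

    package main

    import (
        "io"
        "os"
    )

    // csvParser is a stand-in for mydump.CSVParser: it owns its reader,
    // so Close releases the underlying file as well.
    type csvParser struct{ r io.ReadCloser }

    func newCSVParser(r io.ReadCloser) (*csvParser, error) { return &csvParser{r: r}, nil }
    func (p *csvParser) Close() error                      { return p.r.Close() }

    // openAndParse mirrors the error-path shape of the SplitLargeCSV hunks below.
    func openAndParse(path string) error {
        r, err := os.Open(path)
        if err != nil {
            return err
        }
        p, err := newCSVParser(r) // ownership of r transfers here
        if err != nil {
            return err // the parser, not the caller, is responsible for r now
        }
        // ... on any later error, close the parser, never the raw reader
        return p.Close()
    }

    func main() { _ = openAndParse(os.DevNull) }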

pkg/lightning/mydump/region.go

Lines changed: 7 additions & 1 deletion

@@ -416,13 +416,15 @@ func SplitLargeCSV(
     // Create a utf8mb4 convertor to encode and decode data with the charset of CSV files.
     charsetConvertor, err := NewCharsetConvertor(cfg.DataCharacterSet, cfg.DataInvalidCharReplace)
     if err != nil {
+        _ = r.Close()
         return nil, nil, err
     }
     parser, err := NewCSVParser(ctx, &cfg.CSV, r, cfg.ReadBlockSize, cfg.IOWorkers, true, charsetConvertor)
     if err != nil {
         return nil, nil, err
     }
     if err = parser.ReadColumns(); err != nil {
+        _ = parser.Close()
         return nil, nil, err
     }
     if cfg.CSV.HeaderSchemaMatch {
@@ -433,6 +435,7 @@ func SplitLargeCSV(
         if endOffset > dataFile.FileMeta.FileSize {
             endOffset = dataFile.FileMeta.FileSize
         }
+        _ = parser.Close()
     }
     divisor := int64(cfg.ColumnCnt)
     for {
@@ -446,18 +449,21 @@ func SplitLargeCSV(
         // Create a utf8mb4 convertor to encode and decode data with the charset of CSV files.
         charsetConvertor, err := NewCharsetConvertor(cfg.DataCharacterSet, cfg.DataInvalidCharReplace)
         if err != nil {
+            _ = r.Close()
             return nil, nil, err
         }
         parser, err := NewCSVParser(ctx, &cfg.CSV, r, cfg.ReadBlockSize, cfg.IOWorkers, false, charsetConvertor)
         if err != nil {
            return nil, nil, err
         }
         if err = parser.SetPos(endOffset, 0); err != nil {
+            _ = parser.Close()
             return nil, nil, err
         }
         _, pos, err := parser.ReadUntilTerminator()
         if err != nil {
             if !errors.ErrorEqual(err, io.EOF) {
+                _ = parser.Close()
                 return nil, nil, err
             }
             log.FromContext(ctx).Warn("file contains no terminator at end",
@@ -466,7 +472,7 @@ func SplitLargeCSV(
             pos = dataFile.FileMeta.FileSize
         }
         endOffset = pos
-        parser.Close()
+        _ = parser.Close()
     }
     regions = append(regions,
         &TableRegion{

pkg/lightning/mydump/region_test.go

Lines changed: 95 additions & 0 deletions

@@ -487,3 +487,98 @@ func TestSplitLargeFileOnlyOneChunk(t *testing.T) {
         require.Equal(t, columns, regions[i].Chunk.Columns)
     }
 }
+
+func TestSplitLargeFileSeekInsideCRLF(t *testing.T) {
+    ctx := context.Background()
+    meta := &MDTableMeta{
+        DB:   "csv",
+        Name: "large_csv_seek_inside_crlf",
+    }
+
+    dir := t.TempDir()
+
+    fileName := "test.csv"
+    filePath := filepath.Join(dir, fileName)
+
+    content := []byte("1\r\n2\r\n3\r\n4\r\n")
+    err := os.WriteFile(filePath, content, 0o644)
+    require.NoError(t, err)
+
+    dataFileInfo, err := os.Stat(filePath)
+    require.NoError(t, err)
+    fileSize := dataFileInfo.Size()
+    fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}}
+    ioWorker := worker.NewPool(context.Background(), 4, "io")
+
+    store, err := storage.NewLocalStorage(dir)
+    require.NoError(t, err)
+
+    // if we don't set terminator, it will get the wrong result
+
+    cfg := &config.Config{
+        Mydumper: config.MydumperRuntime{
+            ReadBlockSize: config.ReadBlockSize,
+            CSV: config.CSVConfig{
+                Separator: ",",
+            },
+            StrictFormat:  true,
+            Filter:        []string{"*.*"},
+            MaxRegionSize: 2,
+        },
+    }
+    divideConfig := NewDataDivideConfig(cfg, 1, ioWorker, store, meta)
+
+    // in fact this is the wrong result, just to show the bug. pos mismatch with
+    // offsets. and we might read more rows than expected because we use == rather
+    // than >= to stop reading.
+    offsets := [][]int64{{0, 3}, {3, 6}, {6, 9}, {9, 12}}
+    pos := []int64{2, 5, 8, 11}
+
+    regions, _, err := SplitLargeCSV(context.Background(), divideConfig, fileInfo)
+    require.NoError(t, err)
+    require.Len(t, regions, len(offsets))
+    for i := range offsets {
+        require.Equal(t, offsets[i][0], regions[i].Chunk.Offset)
+        require.Equal(t, offsets[i][1], regions[i].Chunk.EndOffset)
+    }
+
+    file, err := os.Open(filePath)
+    require.NoError(t, err)
+    parser, err := NewCSVParser(ctx, &cfg.Mydumper.CSV, file, 128, ioWorker, false, nil)
+    require.NoError(t, err)
+
+    for parser.ReadRow() == nil {
+        p, _ := parser.Pos()
+        require.Equal(t, pos[0], p)
+        pos = pos[1:]
+    }
+    require.NoError(t, parser.Close())
+
+    // set terminator to "\r\n"
+
+    cfg.Mydumper.CSV.Terminator = "\r\n"
+    divideConfig = NewDataDivideConfig(cfg, 1, ioWorker, store, meta)
+    // pos is contained in expectedOffsets
+    expectedOffsets := [][]int64{{0, 6}, {6, 12}}
+    pos = []int64{3, 6, 9, 12}
+
+    regions, _, err = SplitLargeCSV(context.Background(), divideConfig, fileInfo)
+    require.NoError(t, err)
+    require.Len(t, regions, len(expectedOffsets))
+    for i := range expectedOffsets {
+        require.Equal(t, expectedOffsets[i][0], regions[i].Chunk.Offset)
+        require.Equal(t, expectedOffsets[i][1], regions[i].Chunk.EndOffset)
+    }
+
+    file, err = os.Open(filePath)
+    require.NoError(t, err)
+    parser, err = NewCSVParser(ctx, &cfg.Mydumper.CSV, file, 128, ioWorker, false, nil)
+    require.NoError(t, err)
+
+    for parser.ReadRow() == nil {
+        p, _ := parser.Pos()
+        require.Equal(t, pos[0], p)
+        pos = pos[1:]
+    }
+    require.NoError(t, parser.Close())
+}

tests/integrationtest/r/executor/import_into.result

Lines changed: 2 additions & 0 deletions

@@ -74,6 +74,8 @@ import into t from '/file.csv' with skip_rows=true;
 Error 8164 (HY000): Invalid option value for skip_rows
 import into t from '/file.csv' with split_file='aa';
 Error 8164 (HY000): Invalid option value for split_file
+import into t from '/file.csv' with split_file;
+Error 8164 (HY000): Invalid option value for lines_terminated_by, should not be empty when use split_file
 import into t from '/file.csv' with split_file, skip_rows=2;
 Error 8164 (HY000): Invalid option value for skip_rows, should be <= 1 when split-file is enabled
 import into t from '/file.csv' with disk_quota='aa';
