@@ -17,6 +17,7 @@ package taskexecutor
17
17
import (
18
18
"bytes"
19
19
"context"
20
+ "runtime"
20
21
"sync"
21
22
"sync/atomic"
22
23
"time"
46
47
// updateSubtaskSummaryInterval is the interval for updating the subtask summary to
47
48
// subtask table.
48
49
updateSubtaskSummaryInterval = 3 * time .Second
50
+ // DetectParamModifyInterval is the interval to detect whether task params
51
+ // are modified.
52
+ // exported for testing.
53
+ DetectParamModifyInterval = 5 * time .Second
49
54
)
50
55
51
56
var (
@@ -78,6 +83,10 @@ func NewParamForTest(taskTable TaskTable, slotMgr *slotManager, nodeRc *NodeReso
78
83
// BaseTaskExecutor is the base implementation of TaskExecutor.
79
84
type BaseTaskExecutor struct {
80
85
Param
86
+ // task is a local state that is periodically aligned with what's saved in system
87
+ // table, but if the task has modified params, it might be updated in memory
88
+ // to reflect that some param modifications have been applied successfully,
89
+ // see detectAndHandleParamModifyLoop for more detail.
81
90
task atomic.Pointer [proto.Task ]
82
91
logger * zap.Logger
83
92
ctx context.Context
@@ -256,16 +265,16 @@ func (e *BaseTaskExecutor) Run() {
256
265
}
257
266
258
267
if ! bytes .Equal (oldTask .Meta , newTask .Meta ) {
259
- e .logger .Info ("task meta modified " ,
268
+ e .logger .Info ("task meta modification applied " ,
260
269
zap .String ("oldStep" , proto .Step2Str (oldTask .Type , oldTask .Step )),
261
270
zap .String ("newStep" , proto .Step2Str (newTask .Type , newTask .Step )))
262
271
// when task switch to next step, task meta might change too, but in
263
272
// this case step executor will be recreated with new concurrency and
264
273
// meta, so we only notify it when it's still running the same step.
265
274
if e .stepExec != nil && e .stepExec .GetStep () == newTask .Step {
266
275
e .logger .Info ("notify step executor to update task meta" )
267
- if err2 := e .stepExec .TaskMetaModified (newTask ); err2 != nil {
268
- e .logger .Info ("notify step executor failed, will recreate it" , zap .Error (err2 ))
276
+ if err2 := e .stepExec .TaskMetaModified (e . stepCtx , newTask . Meta ); err2 != nil {
277
+ e .logger .Info ("notify step executor failed, will try recreate it later " , zap .Error (err2 ))
269
278
e .cleanStepExecutor ()
270
279
continue
271
280
}
@@ -277,7 +286,7 @@ func (e *BaseTaskExecutor) Run() {
277
286
zap .Int ("old" , oldTask .Concurrency ), zap .Int ("new" , newTask .Concurrency ))
278
287
return
279
288
}
280
- e .logger .Info ("task concurrency modified " ,
289
+ e .logger .Info ("task concurrency modification applied " ,
281
290
zap .Int ("old" , oldTask .Concurrency ), zap .Int ("new" , newTask .Concurrency ),
282
291
zap .Int ("availableSlots" , e .slotMgr .availableSlots ()))
283
292
newResource := e .nodeRc .getStepResource (newTask .Concurrency )
@@ -440,6 +449,9 @@ func (e *BaseTaskExecutor) runSubtask(subtask *proto.Subtask) (resErr error) {
440
449
e .updateSubtaskSummaryLoop (checkCtx , subtaskCtx , e .stepExec )
441
450
})
442
451
}
452
+ wg .RunWithLog (func () {
453
+ e .detectAndHandleParamModifyLoop (checkCtx )
454
+ })
443
455
defer func () {
444
456
checkCancel ()
445
457
wg .Wait ()
@@ -469,6 +481,125 @@ func (e *BaseTaskExecutor) hasRealtimeSummary(stepExecutor execute.StepExecutor)
469
481
return ok && stepExecutor .RealtimeSummary () != nil
470
482
}
471
483
484
+ // there are 2 places that will detect task param modification:
485
+ // - Run loop to make 'modifies' apply to all later subtasks
486
+ // - this loop to try to make 'modifies' apply to current running subtask
487
+ //
488
+ // for a single step executor, successfully applied 'modifies' will not be applied
489
+ // again, failed ones will be retried in this loop. To achieve this, we will update
490
+ // the task inside BaseTaskExecutor to reflect the 'modifies' that have applied
491
+ // successfully. the 'modifies' that failed to apply in this loop will be retried
492
+ // in the Run loop.
493
+ func (e * BaseTaskExecutor ) detectAndHandleParamModifyLoop (ctx context.Context ) {
494
+ ticker := time .NewTicker (DetectParamModifyInterval )
495
+ defer ticker .Stop ()
496
+ for {
497
+ select {
498
+ case <- ctx .Done ():
499
+ return
500
+ case <- ticker .C :
501
+ }
502
+
503
+ err := e .detectAndHandleParamModify (ctx )
504
+ if err != nil {
505
+ if ctx .Err () != nil {
506
+ return
507
+ }
508
+ e .logger .Warn ("failed to detect and handle param modification" ,
509
+ zap .Int64 ("currSubtaskID" , e .currSubtaskID .Load ()), zap .Error (err ))
510
+ }
511
+ }
512
+ }
513
+
514
+ func (e * BaseTaskExecutor ) detectAndHandleParamModify (ctx context.Context ) error {
515
+ oldTask := e .task .Load ()
516
+ latestTask , err := e .taskTable .GetTaskByID (ctx , oldTask .ID )
517
+ if err != nil {
518
+ return err
519
+ }
520
+
521
+ metaModified := ! bytes .Equal (latestTask .Meta , oldTask .Meta )
522
+ if latestTask .Concurrency == oldTask .Concurrency && ! metaModified {
523
+ return nil
524
+ }
525
+
526
+ e .logger .Info ("task param modification detected" ,
527
+ zap .Int64 ("currSubtaskID" , e .currSubtaskID .Load ()),
528
+ zap .Bool ("metaModified" , metaModified ),
529
+ zap .Int ("oldConcurrency" , oldTask .Concurrency ),
530
+ zap .Int ("newConcurrency" , latestTask .Concurrency ))
531
+
532
+ // we don't report error here, as we might fail to modify task concurrency due
533
+ // to not enough slots, we still need try to apply meta modification.
534
+ e .tryModifyTaskConcurrency (ctx , oldTask , latestTask )
535
+ if metaModified {
536
+ if err := e .stepExec .TaskMetaModified (ctx , latestTask .Meta ); err != nil {
537
+ return errors .Annotate (err , "failed to apply task param modification" )
538
+ }
539
+ e .metaModifyApplied (latestTask .Meta )
540
+ }
541
+ return nil
542
+ }
543
+
544
+ func (e * BaseTaskExecutor ) tryModifyTaskConcurrency (ctx context.Context , oldTask , latestTask * proto.Task ) {
545
+ logger := e .logger .With (zap .Int64 ("currSubtaskID" , e .currSubtaskID .Load ()),
546
+ zap .Int ("old" , oldTask .Concurrency ), zap .Int ("new" , latestTask .Concurrency ))
547
+ if latestTask .Concurrency < oldTask .Concurrency {
548
+ // we need try to release the resource first, then free slots, to avoid
549
+ // OOM when manager starts other task executor and start to allocate memory
550
+ // immediately.
551
+ newResource := e .nodeRc .getStepResource (latestTask .Concurrency )
552
+ if err := e .stepExec .ResourceModified (ctx , newResource ); err != nil {
553
+ logger .Warn ("failed to reduce resource usage" , zap .Error (err ))
554
+ return
555
+ }
556
+ if ! e .slotMgr .exchange (& latestTask .TaskBase ) {
557
+ // we are returning resource back, should not happen
558
+ logger .Warn ("failed to free slots" )
559
+ intest .Assert (false , "failed to return slots" )
560
+ return
561
+ }
562
+
563
+ // after application reduced memory usage, the garbage might not recycle
564
+ // in time, so we trigger GC here.
565
+ //nolint: revive
566
+ runtime .GC ()
567
+ e .concurrencyModifyApplied (latestTask .Concurrency )
568
+ } else if latestTask .Concurrency > oldTask .Concurrency {
569
+ exchanged := e .slotMgr .exchange (& latestTask .TaskBase )
570
+ if ! exchanged {
571
+ logger .Info ("failed to exchange slots" , zap .Int ("availableSlots" , e .slotMgr .availableSlots ()))
572
+ return
573
+ }
574
+ newResource := e .nodeRc .getStepResource (latestTask .Concurrency )
575
+ if err := e .stepExec .ResourceModified (ctx , newResource ); err != nil {
576
+ exchanged := e .slotMgr .exchange (& oldTask .TaskBase )
577
+ intest .Assert (exchanged , "failed to return slots" )
578
+ logger .Warn ("failed to increase resource usage, return slots back" , zap .Error (err ),
579
+ zap .Int ("availableSlots" , e .slotMgr .availableSlots ()), zap .Bool ("exchanged" , exchanged ))
580
+ return
581
+ }
582
+
583
+ e .concurrencyModifyApplied (latestTask .Concurrency )
584
+ }
585
+ }
586
+
587
+ func (e * BaseTaskExecutor ) concurrencyModifyApplied (newConcurrency int ) {
588
+ clone := * e .task .Load ()
589
+ e .logger .Info ("task concurrency modification applied" ,
590
+ zap .Int64 ("currSubtaskID" , e .currSubtaskID .Load ()), zap .Int ("old" , clone .Concurrency ),
591
+ zap .Int ("new" , newConcurrency ), zap .Int ("availableSlots" , e .slotMgr .availableSlots ()))
592
+ clone .Concurrency = newConcurrency
593
+ e .task .Store (& clone )
594
+ }
595
+
596
+ func (e * BaseTaskExecutor ) metaModifyApplied (newMeta []byte ) {
597
+ e .logger .Info ("task meta modification applied" , zap .Int64 ("currSubtaskID" , e .currSubtaskID .Load ()))
598
+ clone := * e .task .Load ()
599
+ clone .Meta = newMeta
600
+ e .task .Store (& clone )
601
+ }
602
+
472
603
// GetTaskBase implements TaskExecutor.GetTaskBase.
473
604
func (e * BaseTaskExecutor ) GetTaskBase () * proto.TaskBase {
474
605
task := e .task .Load ()
0 commit comments