@@ -66,6 +66,7 @@
 #include <Storages/KVStore/TMTContext.h>
 #include <Storages/KVStore/Utils/AsyncTasks.h>
 #include <Storages/Page/V3/PageEntryCheckpointInfo.h>
+#include <Storages/Page/V3/Universal/S3PageReader.h>
 #include <Storages/Page/V3/Universal/UniversalPageIdFormatImpl.h>
 #include <Storages/Page/V3/Universal/UniversalPageStorage.h>
 #include <Storages/PathPool.h>
@@ -477,7 +478,7 @@ Segment::SegmentMetaInfos Segment::readAllSegmentsMetaInfoInRange( //
 
     auto end_to_segment_id_cache = checkpoint_info->checkpoint_data_holder->getEndToSegmentIdCache(
         KeyspaceTableID{context.keyspace_id, context.physical_table_id});
-
+    bool use_cache = context.fap_use_segment_to_end_map_cache;
     // Protected by whatever lock.
     auto build_segments = [&](bool is_cache_ready, PageIdU64 current_segment_id)
         -> std::optional<std::pair<std::vector<std::pair<DM::RowKeyValue, UInt64>>, SegmentMetaInfos>> {
@@ -490,19 +491,34 @@ Segment::SegmentMetaInfos Segment::readAllSegmentsMetaInfoInRange( //
         // The map is used to build cache.
         std::vector<std::pair<DM::RowKeyValue, UInt64>> end_key_and_segment_ids;
         SegmentMetaInfos segment_infos;
+        ReadBufferFromRandomAccessFilePtr reusable_buf = nullptr;
+        size_t total_processed_segments = 0;
+        size_t total_skipped_segments = 0;
+        PS::V3::S3PageReader::ReuseStatAgg reused_agg;
+        // TODO: If regions are added at a slower rate, the cache may not be reused even if the TiFlash region replicas are always added in one table as a whole.
+        // This is because regions added later could use later checkpoints. So, there could be another optimization to avoid generating the cache.
         while (current_segment_id != 0)
         {
             if (cancel_handle->isCanceled())
             {
-                LOG_INFO(log, "FAP is canceled when building segments, built={}", end_key_and_segment_ids.size());
+                LOG_INFO(
+                    log,
+                    "FAP is canceled when building segments, built={}, total_processed_segments={} "
+                    "total_skipped_segments={} reused_agg={}",
+                    end_key_and_segment_ids.size(),
+                    total_processed_segments,
+                    total_skipped_segments,
+                    reused_agg.toString());
                 // FAP task would be cleaned in FastAddPeerImplWrite. So returning empty result is OK.
                 return std::nullopt;
             }
             Segment::SegmentMetaInfo segment_info;
             auto target_id = UniversalPageIdFormat::toFullPageId(
                 UniversalPageIdFormat::toFullPrefix(context.keyspace_id, StorageType::Meta, context.physical_table_id),
                 current_segment_id);
-            auto page = checkpoint_info->temp_ps->read(target_id, nullptr, {}, false);
+            PS::V3::S3PageReader::ReuseStat reason = PS::V3::S3PageReader::ReuseStat::Reused;
+            auto page = checkpoint_info->temp_ps->read(target_id, nullptr, {}, false, reusable_buf, reason);
+            reused_agg.observe(reason);
             if unlikely (!page.isValid())
             {
                 // After #7642, DELTA_MERGE_FIRST_SEGMENT_ID may not exist, however, such checkpoint won't be selected.
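
Note: the new `read` overload threads `reusable_buf` and a `ReuseStat` out-parameter through consecutive page reads, so segment metas that land in the same checkpoint data file can be served from one already-open S3 read buffer instead of re-opening it per segment, while `reused_agg` counts how often that worked. As a rough sketch of the aggregation side (only the `Reused` enumerator and the `observe`/`toString` calls are confirmed by this diff; the other enumerators and the layout are illustrative assumptions, not TiFlash's actual definition):

```cpp
#include <array>
#include <cstddef>
#include <string>

// Illustrative stand-in for PS::V3::S3PageReader's reuse bookkeeping.
// Only `Reused` is confirmed by the diff; the other enumerators are assumed.
enum class ReuseStat : size_t
{
    Reused, // the previous read buffer already covered this page
    NewBuffer, // a fresh buffer had to be opened (assumed)
    NotApplicable, // e.g. the page does not live in a remote file (assumed)
    COUNT,
};

struct ReuseStatAgg
{
    std::array<size_t, static_cast<size_t>(ReuseStat::COUNT)> counters{};

    void observe(ReuseStat reason) { ++counters[static_cast<size_t>(reason)]; }

    // Rendered into log lines such as `reused_agg={reused=5,new=2,na=0}`.
    std::string toString() const
    {
        return "{reused=" + std::to_string(counters[0]) + ",new=" + std::to_string(counters[1])
            + ",na=" + std::to_string(counters[2]) + "}";
    }
};
```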
@@ -519,6 +535,7 @@ Segment::SegmentMetaInfos Segment::readAllSegmentsMetaInfoInRange( //
             readSegmentMetaInfo(buf, segment_info);
             if (!is_cache_ready)
             {
+                FAIL_POINT_PAUSE(FailPoints::pause_when_building_fap_segments);
                 end_key_and_segment_ids.emplace_back(
                     segment_info.range.getEnd().toRowKeyValue(),
                     segment_info.segment_id);
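
Note: `FAIL_POINT_PAUSE(FailPoints::pause_when_building_fap_segments)` now fires inside `build_segments`, on every entry appended by the cache builder, instead of once before the scan; tests can therefore hold the builder mid-build and exercise the readers that wait on the cache. A toy model of the pause-until-disabled semantics (TiFlash's real failpoint machinery differs; this only illustrates the behavior):

```cpp
#include <condition_variable>
#include <mutex>
#include <string>
#include <unordered_map>

// Toy failpoint registry: pause() blocks while the named point is enabled.
class FailPointRegistry
{
    std::mutex mu;
    std::condition_variable cv;
    std::unordered_map<std::string, bool> enabled;

public:
    void enable(const std::string & name)
    {
        std::lock_guard lk(mu);
        enabled[name] = true;
    }

    void disable(const std::string & name)
    {
        {
            std::lock_guard lk(mu);
            enabled[name] = false;
        }
        cv.notify_all(); // wake every thread parked on this point
    }

    // What FAIL_POINT_PAUSE amounts to: wait here until the point is disabled.
    void pause(const std::string & name)
    {
        std::unique_lock lk(mu);
        cv.wait(lk, [&] { return !enabled[name]; });
    }
};
```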
@@ -528,61 +545,95 @@ Segment::SegmentMetaInfos Segment::readAllSegmentsMetaInfoInRange( //
             {
                 segment_infos.emplace_back(segment_info);
             }
-            // if not build cache, stop as early as possible.
-            if (is_cache_ready && segment_info.range.end.value->compare(*target_range.end.value) >= 0)
+            else
             {
-                break;
+                total_skipped_segments++;
             }
+            if (segment_info.range.end.value->compare(*target_range.end.value) >= 0)
+            {
+                // If not building the cache, stop as early as possible.
+                if (is_cache_ready)
+                    break;
+            }
+            total_processed_segments++;
         }
+        LOG_INFO(
+            log,
+            "Finish building segments, target_range={} infos_size={} total_processed_segments={} "
+            "total_skipped_segments={} reused_agg={} use_cache={}",
+            target_range.toDebugString(),
+            segment_infos.size(),
+            total_processed_segments,
+            total_skipped_segments,
+            reused_agg.toString(),
+            use_cache);
         return std::make_pair(end_key_and_segment_ids, segment_infos);
     };
 
+    if (use_cache)
     {
-        // If there is a table building cache, then other table may block to read the built cache.
-        // If the remote reader causes much time to retrieve data, then these tasks could block here.
-        // However, when the execlusive holder is canceled due to timeout, the readers could eventually get the lock.
-        auto lock = end_to_segment_id_cache->writeLock();
-        // - Set to `true`: The building task is done.
-        // - Set to `false`: It is not build yet, or it is building.
-        bool is_cache_ready = end_to_segment_id_cache->isReady(lock);
-        GET_METRIC(tiflash_fap_task_duration_seconds, type_write_stage_wait_build)
-            .Observe(sw.elapsedSecondsFromLastTime());
-
-        if (!is_cache_ready)
+        LOG_DEBUG(log, "Start to read all segments' meta info using the cache");
         {
-            // We are the cache builder.
-            FAIL_POINT_PAUSE(FailPoints::pause_when_building_fap_segments);
+            // If one table is building the cache, other tables may block waiting to read the built cache.
+            // If the remote reader takes a long time to retrieve data, these tasks could block here.
+            // However, when the exclusive holder is canceled due to timeout, the readers can eventually get the lock.
+            auto lock = end_to_segment_id_cache->writeLock();
+            // - Set to `true`: The building task is done.
+            // - Set to `false`: It is not built yet, or it is still building.
+            bool is_cache_ready = end_to_segment_id_cache->isReady(lock);
+            GET_METRIC(tiflash_fap_task_duration_seconds, type_write_stage_wait_build)
+                .Observe(sw.elapsedSecondsFromLastTime());
 
-            auto res = build_segments(is_cache_ready, DELTA_MERGE_FIRST_SEGMENT_ID);
-            // After all segments are scanned, we try to build a cache,
-            // so other FAP tasks that share the same checkpoint could reuse the cache.
+            if (!is_cache_ready)
+            {
+                // We are the cache builder.
+
+                auto res = build_segments(is_cache_ready, DELTA_MERGE_FIRST_SEGMENT_ID);
+                // After all segments are scanned, we try to build a cache,
+                // so other FAP tasks that share the same checkpoint could reuse the cache.
+                if (!res)
+                    return {};
+                auto & [end_key_and_segment_ids, segment_infos] = *res;
+                LOG_DEBUG(
+                    log,
+                    "Segment meta info cache has been built, num_segments={}",
+                    end_key_and_segment_ids.size());
+                end_to_segment_id_cache->build(lock, std::move(end_key_and_segment_ids));
+                return std::move(segment_infos);
+            }
+        }
+        {
+            // The cache is already built, which is the normal case when the checkpoint is reused.
+            auto lock = end_to_segment_id_cache->readLock();
+            bool is_cache_ready = end_to_segment_id_cache->isReady(lock);
+            RUNTIME_CHECK(is_cache_ready, checkpoint_info->region_id, context.keyspace_id, context.physical_table_id);
+            GET_METRIC(tiflash_fap_task_result, type_reuse_chkpt_cache).Increment();
+            // ... then we can seek to `current_segment_id` in the cache to avoid some reads.
+            auto current_segment_id
+                = end_to_segment_id_cache->getSegmentIdContainingKey(lock, target_range.getStart().toRowKeyValue());
+            auto res = build_segments(is_cache_ready, current_segment_id);
             if (!res)
                 return {};
-            auto & [end_key_and_segment_ids, segment_infos] = *res;
-            LOG_DEBUG(log, "Segment meta info cache has been built, num_segments={}", end_key_and_segment_ids.size());
-            end_to_segment_id_cache->build(lock, std::move(end_key_and_segment_ids));
-            return std::move(segment_infos);
+            return std::move(res->second);
         }
     }
+    else
     {
-        // If we found the cache is built, which could be normal cases when the checkpoint is reused.
-        auto lock = end_to_segment_id_cache->readLock();
-        bool is_cache_ready = end_to_segment_id_cache->isReady(lock);
-        RUNTIME_CHECK(is_cache_ready, checkpoint_info->region_id, context.keyspace_id, context.physical_table_id);
-        GET_METRIC(tiflash_fap_task_result, type_reuse_chkpt_cache).Increment();
-        // ... then we could seek to `current_segment_id` in cache to avoid some read.
-        auto current_segment_id
-            = end_to_segment_id_cache->getSegmentIdContainingKey(lock, target_range.getStart().toRowKeyValue());
-        auto res = build_segments(is_cache_ready, current_segment_id);
+        LOG_DEBUG(log, "Start to read all segments' meta info directly");
+        // Set `is_cache_ready == true` to let `build_segments` return once it finds all
+        // overlapped segments.
+        auto res = build_segments(true, DELTA_MERGE_FIRST_SEGMENT_ID);
         if (!res)
             return {};
-        return std::move(res->second);
+        auto & [_end_key_and_segment_ids, segment_infos] = *res;
+        UNUSED(_end_key_and_segment_ids);
+        return std::move(segment_infos);
     }
 
     if (cancel_handle->isCanceled())
     {
         LOG_INFO(log, "FAP is canceled when building segments");
-        // FAP task would be cleaned in FastAddPeerImplWrite. So returning incompelete result could be OK.
+        // FAP task would be cleaned in FastAddPeerImplWrite. So returning an incomplete result could be OK.
         return {};
     }
 }
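
Note: `getSegmentIdContainingKey` is what makes the cache pay off. Instead of walking the whole segment chain from `DELTA_MERGE_FIRST_SEGMENT_ID`, a later task can jump straight to the first segment whose range may contain `target_range.getStart()`. If the `end_key_and_segment_ids` pairs are kept sorted by end key, that lookup is a single `upper_bound` probe: with left-closed, right-open segment ranges, the first end key strictly greater than the probe key identifies the containing segment. A sketch under those assumptions, with `std::string` keys standing in for `DM::RowKeyValue`:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <string>

// Sorted end-key -> segment-id index, one plausible shape for the built cache.
using EndKeyIndex = std::map<std::string, uint64_t>;

// First segment whose half-open range [start, end) contains `key`:
// the entry with the first end key strictly greater than `key`.
uint64_t getSegmentIdContainingKey(const EndKeyIndex & index, const std::string & key)
{
    auto it = index.upper_bound(key);
    assert(it != index.end()); // the last segment's end key is +inf in practice
    return it->second;
}

int main()
{
    // Three segments: [-inf,"b") -> 1, ["b","d") -> 5, ["d",+inf) -> 9.
    EndKeyIndex index{{"b", 1}, {"d", 5}, {"\xff", 9}};
    assert(getSegmentIdContainingKey(index, "a") == 1);
    assert(getSegmentIdContainingKey(index, "b") == 5); // end keys are exclusive
    assert(getSegmentIdContainingKey(index, "c") == 5);
    return 0;
}
```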
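Note: the `use_cache` path as a whole is a build-once, read-many pattern. The first FAP task takes the exclusive lock, sees `isReady() == false`, builds the cache, and publishes it; every later task sharing the same checkpoint falls through to the shared-lock block and reuses it. A condensed model with `std::shared_mutex` (the real `end_to_segment_id_cache` additionally lets waiting readers proceed when the exclusive holder is canceled by timeout, which this sketch omits):

```cpp
#include <cstdint>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

// Minimal build-once, read-many cache mirroring the writeLock()/readLock() flow.
class EndToSegmentIdCache
{
    std::shared_mutex mu;
    bool ready = false;
    std::map<std::string, uint64_t> index; // end key -> segment id

public:
    template <typename BuildFn>
    std::map<std::string, uint64_t> getOrBuild(BuildFn && build_fn)
    {
        {
            // Exclusive lock first, as in the writeLock()/isReady() sequence above.
            std::unique_lock lock(mu);
            if (!ready)
            {
                index = build_fn(); // expensive: scans every segment meta once
                ready = true;
                return index;
            }
        }
        // The cache is ready: later tasks only need the shared lock.
        std::shared_lock lock(mu);
        return index;
    }
};
```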