Commit ec5599c

fix(stream-generator): Split create/keep-alive streams routines (#17815)
1 parent 3686861 commit ec5599c

File tree

5 files changed: +119 -76 lines changed

tools/stream-generator/generator/config.go

Lines changed: 6 additions & 6 deletions
@@ -40,8 +40,8 @@ type Config struct {
 	NumTenants int `yaml:"num_tenants"`
 	TenantPrefix string `yaml:"tenant_prefix"`
 	QPSPerTenant int `yaml:"qps_per_tenant"`
-	BatchSize int `yaml:"batch_size"`
-	BatchInterval time.Duration `yaml:"batch_interval"`
+	CreateBatchSize int `yaml:"create_batch_size"`
+	CreateNewStreamsInterval time.Duration `yaml:"create_new_streams_interval"`
 	StreamsPerTenant int `yaml:"streams_per_tenant"`
 	StreamLabels []string `yaml:"stream_labels"`
 	MaxGlobalStreamsPerTenant int `yaml:"max_global_streams_per_tenant"`
@@ -84,8 +84,8 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) {
 	f.IntVar(&c.NumTenants, "tenants.total", 1, "Number of tenants to generate metadata for")
 	f.StringVar(&c.TenantPrefix, "tenants.prefix", "", "Prefix for tenant IDs")
 	f.IntVar(&c.QPSPerTenant, "tenants.qps", 10, "Number of QPS per tenant")
-	f.IntVar(&c.BatchSize, "tenants.streams.batch-size", 100, "Number of streams to send to Kafka per tick")
-	f.DurationVar(&c.BatchInterval, "tenants.streams.batch-interval", 1*time.Minute, "Number of milliseconds to wait between batches. If set to 0, it will be calculated based on QPSPerTenant.")
+	f.IntVar(&c.CreateBatchSize, "tenants.streams.create-batch-size", 100, "Number of streams to send to Kafka per tick")
+	f.DurationVar(&c.CreateNewStreamsInterval, "tenants.streams.create-interval", 1*time.Minute, "Number of milliseconds to wait between batches. If set to 0, it will be calculated based on QPSPerTenant.")
 	f.IntVar(&c.StreamsPerTenant, "tenants.streams.total", 100, "Number of streams per tenant")
 	f.IntVar(&c.MaxGlobalStreamsPerTenant, "tenants.max-global-streams", 1000, "Maximum number of global streams per tenant")
 	f.IntVar(&c.HTTPListenPort, "http-listen-port", 3100, "HTTP Listener port")
@@ -116,8 +116,8 @@ func (c *Config) Validate() error {
 	}
 	c.PushMode = PushModeType(c.pushModeRaw)

-	if c.BatchInterval <= 0 {
-		c.BatchInterval = time.Second / time.Duration(c.QPSPerTenant)
+	if c.CreateNewStreamsInterval <= 0 {
+		c.CreateNewStreamsInterval = time.Second / time.Duration(c.QPSPerTenant)
 	}

 	return nil
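The renamed settings keep their old defaults (batch size 100, interval 1m) and the QPS-derived fallback when the interval is zero or negative. A minimal wiring sketch follows; the import path and main-function scaffolding are assumptions, while Config, RegisterFlags, and Validate are as shown in the diff above.

package main

import (
	"flag"
	"os"

	"github.com/go-kit/log"

	// Assumed import path for the package shown in this diff.
	"github.com/grafana/loki/v3/tools/stream-generator/generator"
)

func main() {
	logger := log.NewLogfmtLogger(os.Stderr)

	var cfg generator.Config
	cfg.RegisterFlags(flag.CommandLine, logger)

	// e.g. -tenants.streams.create-batch-size=200 -tenants.streams.create-interval=30s
	flag.Parse()

	// Validate falls back to time.Second / QPSPerTenant when the create interval is <= 0.
	if err := cfg.Validate(); err != nil {
		logger.Log("msg", "invalid config", "err", err)
		os.Exit(1)
	}
}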

tools/stream-generator/generator/distributor.go

Lines changed: 5 additions & 6 deletions
@@ -4,16 +4,17 @@ import (
 	"context"
 	"fmt"

-	"github.com/go-kit/log/level"
 	"github.com/grafana/dskit/user"
 	"github.com/grafana/loki/v3/pkg/distributor"
 	"github.com/grafana/loki/v3/pkg/logproto"
 )

-func (s *Generator) sendStreams(ctx context.Context, batch []distributor.KeyedStream, streamIdx int, batchSize int, tenant string, errCh chan error) {
+func (s *Generator) sendStreams(ctx context.Context, tenant string, batch []distributor.KeyedStream, errCh chan<- error) {
+	batchSize := len(batch)
+
 	userCtx, err := user.InjectIntoGRPCRequest(user.InjectOrgID(ctx, tenant))
 	if err != nil {
-		errCh <- fmt.Errorf("failed to inject user context (tenant: %s, stream_idx: %d, batch_size: %d): %w", tenant, streamIdx, batchSize, err)
+		errCh <- fmt.Errorf("failed to inject user context (tenant: %s, batch_size: %d): %w", tenant, batchSize, err)
 		return
 	}

@@ -31,9 +32,7 @@ func (s *Generator) sendStreams(ctx context.Context, batch []distributor.KeyedSt

 	_, err = s.distributorClient.Push(userCtx, pushReq)
 	if err != nil {
-		errCh <- fmt.Errorf("failed to push streams to distributor (tenant: %s, stream_idx: %d, batch_size: %d): %w", tenant, streamIdx, batchSize, err)
+		errCh <- fmt.Errorf("failed to push streams to distributor (tenant: %s, batch_size: %d): %w", tenant, batchSize, err)
 		return
 	}
-
-	level.Debug(s.logger).Log("msg", "Sent streams to distributor", "tenant", tenant, "batch_size", batchSize, "stream_idx", streamIdx)
 }
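The reworked sendStreams takes the tenant first, derives batchSize from the batch itself, and only ever sends on errCh (now chan<- error). For context, the surrounding fan-in pattern — a channel buffered per tenant that the running loop waits on — looks roughly like this standalone sketch; the tenants, worker bodies, and error values are made up:

package main

import (
	"context"
	"errors"
	"fmt"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	tenants := []string{"tenant-0", "tenant-1"}
	// Buffered like errCh := make(chan error, s.cfg.NumTenants) in running().
	errCh := make(chan error, len(tenants))

	for _, tenant := range tenants {
		go func(tenant string) {
			// A worker only sends on errCh (chan<- error); it never reads from it.
			if tenant == "tenant-1" {
				errCh <- fmt.Errorf("push failed (tenant: %s): %w", tenant, errors.New("boom"))
			}
		}(tenant)
	}

	// The owner of the channel decides whether to stop on the first error.
	select {
	case <-ctx.Done():
	case err := <-errCh:
		fmt.Println("stopping on first tenant error:", err)
	}
}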

tools/stream-generator/generator/kafka.go

Lines changed: 12 additions & 11 deletions
@@ -20,21 +20,23 @@ import (
 	"github.com/twmb/franz-go/pkg/kgo"
 )

-func (s *Generator) sendStreamMetadata(ctx context.Context, streamsBatch []distributor.KeyedStream, streamIdx int, batchSize int, tenant string, errCh chan error) {
+func (s *Generator) sendStreamMetadata(ctx context.Context, tenant string, batch []distributor.KeyedStream, errCh chan<- error) {
+	batchSize := len(batch)
+
 	client, err := s.getFrontendClient()
 	if err != nil {
-		errCh <- fmt.Errorf("failed to get ingest limits frontend client (tenant: %s, stream_idx: %d, batch_size: %d): %w", tenant, streamIdx, batchSize, err)
+		errCh <- fmt.Errorf("failed to get ingest limits frontend client (tenant: %s, batch_size: %d): %w", tenant, batchSize, err)
 		return
 	}

 	userCtx, err := user.InjectIntoGRPCRequest(user.InjectOrgID(ctx, tenant))
 	if err != nil {
-		errCh <- fmt.Errorf("failed to inject user context (tenant: %s, stream_idx: %d, batch_size: %d): %w", tenant, streamIdx, batchSize, err)
+		errCh <- fmt.Errorf("failed to inject user context (tenant: %s, batch_size: %d): %w", tenant, batchSize, err)
 		return
 	}

 	var streamMetadata []*proto.StreamMetadata
-	for _, stream := range streamsBatch {
+	for _, stream := range batch {
 		streamMetadata = append(streamMetadata, &proto.StreamMetadata{
 			StreamHash: stream.HashKeyNoShard,
 		})
@@ -47,13 +49,13 @@ func (s *Generator) sendStreamMetadata(ctx context.Context, streamsBatch []distr

 	// Check if the stream exceeds limits
 	if client == nil {
-		errCh <- fmt.Errorf("no ingest limits frontend client (tenant: %s, stream_idx: %d, batch_size: %d)", tenant, streamIdx, batchSize)
+		errCh <- fmt.Errorf("no ingest limits frontend client (tenant: %s, batch_size: %d)", tenant, batchSize)
 		return
 	}

 	resp, err := client.ExceedsLimits(userCtx, req)
 	if err != nil {
-		errCh <- fmt.Errorf("failed to check if stream exceeds limits (tenant: %s, stream_idx: %d, batch_size: %d): %w", tenant, streamIdx, batchSize, err)
+		errCh <- fmt.Errorf("failed to check if stream exceeds limits (tenant: %s, batch_size: %d): %w", tenant, batchSize, err)
 		return
 	}

@@ -69,18 +71,17 @@ func (s *Generator) sendStreamMetadata(ctx context.Context, streamsBatch []distr
 			results += fmt.Sprintf("%s: %d, ", reason, count)
 		}

-		level.Info(s.logger).Log("msg", "Stream exceeds limits", "tenant", tenant, "batch_size", batchSize, "stream_idx", streamIdx, "rejected", results)
+		level.Info(s.logger).Log("msg", "Stream exceeds limits", "tenant", tenant, "batch_size", batchSize, "rejected", results)
 		return
 	case len(resp.Results) == 0:
-		level.Debug(s.logger).Log("msg", "Stream accepted", "tenant", tenant, "batch_size", batchSize, "stream_idx", streamIdx)
+		level.Debug(s.logger).Log("msg", "Stream accepted", "tenant", tenant, "batch_size", batchSize)
 	}

 	// Send single stream to Kafka
-	s.sendStreamsToKafka(ctx, streamsBatch, tenant, errCh)
-	level.Debug(s.logger).Log("msg", "Sent streams to Kafka", "tenant", tenant, "batch_size", batchSize, "stream_idx", streamIdx)
+	s.sendStreamsToKafka(ctx, batch, tenant, errCh)
 }

-func (s *Generator) sendStreamsToKafka(ctx context.Context, streams []distributor.KeyedStream, tenant string, errCh chan error) {
+func (s *Generator) sendStreamsToKafka(ctx context.Context, streams []distributor.KeyedStream, tenant string, errCh chan<- error) {
 	for _, stream := range streams {
 		go func(stream distributor.KeyedStream) {
 			partitionID := int32(stream.HashKeyNoShard % uint64(s.cfg.NumPartitions))
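sendStreamsToKafka itself is unchanged apart from the send-only channel: each stream's HashKeyNoShard is reduced modulo the configured partition count, so the same stream lands on the same partition on every create and keep-alive push. A standalone sketch of that mapping, with made-up hash values and partition count:

package main

import "fmt"

func main() {
	// Mirrors s.cfg.NumPartitions in the generator; the value here is arbitrary.
	numPartitions := uint64(8)

	// Stand-ins for stream.HashKeyNoShard values.
	hashes := []uint64{0xdeadbeef, 0xcafebabe, 0x12345678}

	for _, h := range hashes {
		// Same expression as in sendStreamsToKafka.
		partitionID := int32(h % numPartitions)
		fmt.Printf("stream hash %#x -> partition %d\n", h, partitionID)
	}
}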

tools/stream-generator/generator/metrics.go

Lines changed: 11 additions & 6 deletions
@@ -8,16 +8,21 @@ import (
 )

 type metrics struct {
-	activeStreamsTotal *prometheus.CounterVec
-	kafkaWriteLatency prometheus.Histogram
-	kafkaWriteBytesTotal prometheus.Counter
+	streamsCreatedTotal *prometheus.CounterVec
+	streamsKeepAliveTotal *prometheus.CounterVec
+	kafkaWriteLatency prometheus.Histogram
+	kafkaWriteBytesTotal prometheus.Counter
 }

 func newMetrics(reg prometheus.Registerer) *metrics {
 	return &metrics{
-		activeStreamsTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
-			Name: "active_streams_total",
-			Help: "The total number of active streams",
+		streamsCreatedTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
+			Name: "streams_created_total",
+			Help: "The total number of streams create operations per tenant",
+		}, []string{"tenant"}),
+		streamsKeepAliveTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
+			Name: "streams_keep_alive_total",
+			Help: "The total number of streams keep alive operations per tenant",
 		}, []string{"tenant"}),
 		kafkaWriteLatency: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
 			Name: "kafka_write_latency_seconds",

tools/stream-generator/generator/service.go

Lines changed: 85 additions & 47 deletions
@@ -42,6 +42,10 @@ type Generator struct {
 	// payload
 	streams map[string][]distributor.KeyedStream

+	// active streams
+	activeStreams int
+	activeStreamsMtx sync.RWMutex
+
 	// kafka
 	writer *client.Producer

@@ -138,7 +142,7 @@ func (s *Generator) starting(ctx context.Context) error {
 	s.ctx, s.cancel = context.WithCancel(ctx)

 	// Calculate optimal QPS to match the desired rate
-	s.cfg.QPSPerTenant = calculateOptimalQPS(s.cfg.DesiredRate, s.cfg.BatchSize, s.logger)
+	s.cfg.QPSPerTenant = calculateOptimalQPS(s.cfg.DesiredRate, s.cfg.CreateBatchSize, s.logger)
 	level.Info(s.logger).Log("msg", fmt.Sprintf("Adjusted QPS per tenant to %d to match desired rate of %d bytes/s",
 		s.cfg.QPSPerTenant, s.cfg.DesiredRate))

@@ -161,53 +165,12 @@ func (s *Generator) running(ctx context.Context) error {
 	// Create error channel to collect errors from goroutines
 	errCh := make(chan error, s.cfg.NumTenants)

-	// Start a goroutine for each tenant
+	// Start goroutines for each tenant:
+	// - create: creates new streams in intervals
+	// - keepAlive: keeps existing streams alive by re-sending them to the backend
 	for tenant, streams := range s.streams {
-		s.wg.Add(1)
-		go func(tenant string, streams []distributor.KeyedStream) {
-			defer s.wg.Done()
-
-			// Create a ticker for rate limiting based on QPSPerTenant
-			ticker := time.NewTicker(s.cfg.BatchInterval)
-			defer ticker.Stop()
-
-			// Keep track of current stream index and whether we've completed first pass
-			streamIdx := 0
-			firstPassComplete := false
-
-			for {
-				select {
-				case <-ctx.Done():
-					return
-				case <-ticker.C:
-					if streamIdx >= len(streams) {
-						streamIdx = 0
-						firstPassComplete = true
-					}
-
-					batchSize := s.cfg.BatchSize
-					if streamIdx+batchSize > len(streams) {
-						batchSize = len(streams) - streamIdx
-					}
-
-					streamsBatch := streams[streamIdx : streamIdx+batchSize]
-
-					switch s.cfg.PushMode {
-					case PushStreamMetadataOnly:
-						s.sendStreamMetadata(ctx, streamsBatch, streamIdx, batchSize, tenant, errCh)
-					case PushStream:
-						s.sendStreams(ctx, streamsBatch, streamIdx, batchSize, tenant, errCh)
-					}
-
-					// Only increment during the first pass
-					if !firstPassComplete {
-						s.metrics.activeStreamsTotal.WithLabelValues(tenant).Add(float64(batchSize))
-					}
-
-					streamIdx += batchSize
-				}
-			}
-		}(tenant, streams)
+		go s.create(ctx, tenant, streams, errCh)
+		go s.keepAlive(ctx, tenant, streams, errCh)
 	}

 	// Wait for context cancellation, subservice failure, or tenant error
@@ -245,6 +208,81 @@ func (s *Generator) GetFrontendRing() *ring.Ring {
 	return s.frontendRing
 }

+func (s *Generator) create(ctx context.Context, tenant string, streams []distributor.KeyedStream, errCh chan<- error) {
+	s.wg.Add(1)
+	defer s.wg.Done()
+
+	createT := time.NewTicker(s.cfg.CreateNewStreamsInterval)
+	total := len(streams)
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-createT.C:
+			func() {
+				s.activeStreamsMtx.Lock()
+				defer s.activeStreamsMtx.Unlock()
+
+				if s.activeStreams >= total {
+					createT.Stop()
+					return
+				}
+
+				batchSize := s.cfg.CreateBatchSize
+				if s.activeStreams+batchSize > total {
+					batchSize = total - s.activeStreams
+				}
+
+				batch := streams[s.activeStreams : s.activeStreams+batchSize]
+				s.pushStreams(ctx, tenant, batch, errCh)
+
+				s.metrics.streamsCreatedTotal.WithLabelValues(tenant).Inc()
+				s.activeStreams += batchSize
+			}()
+		}
+	}
+}
+
+func (s *Generator) keepAlive(ctx context.Context, tenant string, streams []distributor.KeyedStream, errCh chan<- error) {
+	s.wg.Add(1)
+	defer s.wg.Done()
+
+	// Create a ticker to re-send already created streams at the per-tenant QPS interval
+	aliveT := time.NewTicker(time.Second / time.Duration(s.cfg.QPSPerTenant))
+	defer aliveT.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-aliveT.C:
+			func() {
+				s.activeStreamsMtx.RLock()
+				defer s.activeStreamsMtx.RUnlock()
+
+				if s.activeStreams < s.cfg.CreateBatchSize {
+					// Skip until first batch is created
+					return
+				}
+
+				batch := streams[:s.activeStreams-1]
+				s.pushStreams(ctx, tenant, batch, errCh)
+				s.metrics.streamsKeepAliveTotal.WithLabelValues(tenant).Inc()
+			}()
+		}
+	}
+}
+
+func (s *Generator) pushStreams(ctx context.Context, tenant string, streams []distributor.KeyedStream, errCh chan<- error) {
+	switch s.cfg.PushMode {
+	case PushStreamMetadataOnly:
+		s.sendStreamMetadata(ctx, tenant, streams, errCh)
+	case PushStream:
+		s.sendStreams(ctx, tenant, streams, errCh)
+	}
+}
+
 // calculateOptimalQPS calculates the optimal QPS to achieve the desired ingestion rate
 func calculateOptimalQPS(desiredRate, batchSize int, logger log.Logger) int {
 	// Calculate bytes per stream for normal streams
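Taken together, create() advances an activeStreams cursor by up to CreateBatchSize per tick until every configured stream has been sent once, while keepAlive() re-sends the streams created so far (streams[:activeStreams-1] in the diff) at the per-tenant QPS. A standalone sketch of the cursor arithmetic, with made-up values and prints in place of the actual pushes:

package main

import "fmt"

func main() {
	total := 250           // streams configured for the tenant (tenants.streams.total)
	createBatchSize := 100 // tenants.streams.create-batch-size
	activeStreams := 0

	for tick := 1; activeStreams < total; tick++ {
		// Same clamping as in create(): never run past the configured stream count.
		batchSize := createBatchSize
		if activeStreams+batchSize > total {
			batchSize = total - activeStreams
		}

		fmt.Printf("create tick %d: streams[%d:%d]\n", tick, activeStreams, activeStreams+batchSize)
		activeStreams += batchSize

		// Meanwhile keepAlive() re-sends streams[:activeStreams-1] on every QPS tick,
		// once at least one full create batch exists.
	}
	// Output: streams[0:100], streams[100:200], streams[200:250], then only keep-alives.
}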
