feat: check for failed reason in distributors #18128
Changes from all commits: 9c11c0b, e7b1195, 31dad80, 61406e2
```diff
@@ -718,25 +718,6 @@ func (d *Distributor) PushWithResolver(ctx context.Context, req *logproto.PushRequest
 		return &logproto.PushResponse{}, validationErr
 	}
 
-	if d.cfg.IngestLimitsEnabled {
-		streamsAfterLimits, reasonsForHashes, err := d.ingestLimits.enforceLimits(ctx, tenantID, streams)
-		if err != nil {
-			level.Error(d.logger).Log("msg", "failed to check if request exceeds limits, request has been accepted", "err", err)
-		} else if len(streamsAfterLimits) == 0 {
-			// All streams have been dropped.
-			level.Debug(d.logger).Log("msg", "request exceeded limits, all streams will be dropped", "tenant", tenantID)
-			if !d.cfg.IngestLimitsDryRunEnabled {
-				return nil, httpgrpc.Error(http.StatusTooManyRequests, "request exceeded limits: "+firstReasonForHashes(reasonsForHashes))
-			}
-		} else if len(streamsAfterLimits) < len(streams) {
-			// Some streams have been dropped.
-			level.Debug(d.logger).Log("msg", "request exceeded limits, some streams will be dropped", "tenant", tenantID)
-			if !d.cfg.IngestLimitsDryRunEnabled {
-				streams = streamsAfterLimits
-			}
-		}
-	}
-
 	if !d.ingestionRateLimiter.AllowN(now, tenantID, validationContext.validationMetrics.aggregatedPushStats.lineSize) {
 		d.trackDiscardedData(ctx, req, validationContext, tenantID, validationContext.validationMetrics, validation.RateLimited, streamResolver)

@@ -746,6 +727,19 @@ func (d *Distributor) PushWithResolver(ctx context.Context, req *logproto.PushRequest
 		return nil, httpgrpc.Errorf(http.StatusTooManyRequests, "%s", err.Error())
 	}
 
+	// These limits are checked after the ingestion rate limit as this
+	// is how it works in ingesters.
+	if d.cfg.IngestLimitsEnabled {
+		accepted, err := d.ingestLimits.EnforceLimits(ctx, tenantID, streams)
+		if err == nil && !d.cfg.IngestLimitsDryRunEnabled {
+			if len(accepted) == 0 {
+				// All streams were rejected, the request should be failed.
+				return nil, httpgrpc.Error(http.StatusTooManyRequests, "request exceeded limits")
+			}
+			streams = accepted
+		}
+	}
+
 	// Nil check for performance reasons, to avoid dynamic lookup and/or no-op
 	// function calls that cannot be inlined.
 	if d.tee != nil {
```

Review comment on `streams = accepted`: Since we "shrink" the streams slice to the accepted ones, I believe the check of the ingestionRateLimiter is operating on the wrong calculated value. Imagine accepting a subset of the incoming streams but rate limiting on the total incoming.

Reply: I will think about how to solve this 😢 We had the same behavior before this PR, where we were rate limiting on streams discarded over the stream limit, so let's keep this PR scoped to the feature and I will open a second PR to address this problem.
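One way a follow-up PR might address that concern is to compute the rate-limited size from only the accepted streams. The sketch below is illustrative, not the actual Loki code: `totalLineSize` is a hypothetical helper assumed to live in the distributor package, and the `Stream.Entries`/`Line` field access is an assumption about the `KeyedStream` shape.

```go
// Hypothetical helper, not part of this PR: sum the line sizes of only the
// accepted streams so the rate limiter sees the post-limits volume rather
// than the aggregated push stats of the full request.
func totalLineSize(streams []KeyedStream) int {
	var size int
	for _, s := range streams {
		for _, e := range s.Stream.Entries { // assumed field layout
			size += len(e.Line)
		}
	}
	return size
}
```

The distributor could then call `d.ingestionRateLimiter.AllowN(now, tenantID, totalLineSize(accepted))` after enforcing limits, at the cost of iterating the entries a second time.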
```diff
@@ -2462,7 +2462,7 @@ func TestDistributor_PushIngestLimits(t *testing.T) {
 				Reason: uint32(limits.ReasonMaxStreams),
 			}},
 		},
-		expectedErr: "rpc error: code = Code(429) desc = request exceeded limits: max streams",
+		expectedErr: "rpc error: code = Code(429) desc = request exceeded limits",
 	}, {
 		name:                "one of two streams exceed max stream limit, request is accepted",
 		ingestLimitsEnabled: true,
```

Review comment: I got rid of the reason for now; I'm not sure if it's useful, as it doesn't mention which streams. I want to think about how to better communicate this data back to the user, given that some requests can be really large.
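If the reasons are later reinstated, one option for keeping the 429 message bounded on very large requests is to cap how many per-stream reasons are reported. A hedged sketch, assuming it lives in the distributor package with `fmt` and `strings` imported; `limitsErrorMessage` and `maxReported` are illustrative names, while `limits.Reason(...).String()` is the same humanization used by the code this PR removes:

```go
// Hypothetical helper: report at most maxReported per-stream reasons so the
// error message stays small even when thousands of streams are rejected.
func limitsErrorMessage(results []*proto.ExceedsLimitsResult, maxReported int) string {
	var sb strings.Builder
	sb.WriteString("request exceeded limits")
	for i, res := range results {
		if i == maxReported {
			fmt.Fprintf(&sb, " (and %d more)", len(results)-maxReported)
			break
		}
		fmt.Fprintf(&sb, "; stream %d: %s", res.StreamHash, limits.Reason(res.Reason).String())
	}
	return sb.String()
}
```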
```diff
@@ -18,7 +18,7 @@ import (
 
 // ingestLimitsFrontendClient is used for tests.
 type ingestLimitsFrontendClient interface {
-	exceedsLimits(context.Context, *proto.ExceedsLimitsRequest) (*proto.ExceedsLimitsResponse, error)
+	ExceedsLimits(context.Context, *proto.ExceedsLimitsRequest) (*proto.ExceedsLimitsResponse, error)
 }
 
 // ingestLimitsFrontendRingClient uses the ring to query ingest-limits frontends.
```
```diff
@@ -35,7 +35,7 @@ func newIngestLimitsFrontendRingClient(ring ring.ReadRing, pool *ring_client.Pool
 }
 
 // Implements the ingestLimitsFrontendClient interface.
-func (c *ingestLimitsFrontendRingClient) exceedsLimits(ctx context.Context, req *proto.ExceedsLimitsRequest) (*proto.ExceedsLimitsResponse, error) {
+func (c *ingestLimitsFrontendRingClient) ExceedsLimits(ctx context.Context, req *proto.ExceedsLimitsRequest) (*proto.ExceedsLimitsResponse, error) {
 	// We use an FNV-1 of all stream hashes in the request to load balance requests
 	// to limits-frontends instances.
 	h := fnv.New32()
```
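To illustrate the load-balancing idea described in that comment, here is a self-contained sketch: hash every stream hash with FNV-1 and map the combined digest onto an instance. The real client resolves the owner through the ring rather than a modulo, so `pickInstance` below is an assumption for demonstration only:

```go
package main

import (
	"encoding/binary"
	"fmt"
	"hash/fnv"
)

// pickInstance feeds all stream hashes into an FNV-1 hash (fnv.New32, not
// the FNV-1a variant) and uses the digest to choose a frontend. The modulo
// stands in for the ring lookup used by the real client.
func pickInstance(streamHashes []uint64, instances []string) string {
	h := fnv.New32()
	buf := make([]byte, 8)
	for _, sh := range streamHashes {
		binary.LittleEndian.PutUint64(buf, sh)
		_, _ = h.Write(buf)
	}
	return instances[h.Sum32()%uint32(len(instances))]
}

func main() {
	// The same set of stream hashes always lands on the same frontend.
	fmt.Println(pickInstance([]uint64{42, 7, 19}, []string{"frontend-0", "frontend-1", "frontend-2"}))
}
```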
```diff
@@ -78,64 +78,85 @@ func (c *ingestLimitsFrontendRingClient) exceedsLimits(ctx context.Context, req *proto.ExceedsLimitsRequest) (*proto.ExceedsLimitsResponse, error) {
 
 type ingestLimits struct {
 	client         ingestLimitsFrontendClient
-	limitsFailures prometheus.Counter
+	requests       prometheus.Counter
+	requestsFailed prometheus.Counter
 }
 
 func newIngestLimits(client ingestLimitsFrontendClient, r prometheus.Registerer) *ingestLimits {
 	return &ingestLimits{
 		client: client,
-		limitsFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
-			Name: "loki_distributor_ingest_limits_failures_total",
-			Help: "The total number of failures checking ingest limits.",
+		requests: promauto.With(r).NewCounter(prometheus.CounterOpts{
+			Name: "loki_distributor_ingest_limits_requests_total",
+			Help: "The total number of requests.",
+		}),
+		requestsFailed: promauto.With(r).NewCounter(prometheus.CounterOpts{
+			Name: "loki_distributor_ingest_limits_requests_failed_total",
+			Help: "The total number of requests that failed.",
 		}),
 	}
 }
```
```diff
-// enforceLimits returns a slice of streams that are within the per-tenant
-// limits, and in the case where one or more streams exceed per-tenant
-// limits, the reasons those streams were not included in the result.
-// An error is returned if per-tenant limits could not be enforced.
-func (l *ingestLimits) enforceLimits(ctx context.Context, tenant string, streams []KeyedStream) ([]KeyedStream, map[uint64][]string, error) {
-	exceedsLimits, reasons, err := l.exceedsLimits(ctx, tenant, streams)
-	if !exceedsLimits || err != nil {
-		return streams, nil, err
+// EnforceLimits checks all streams against the per-tenant limits and returns
+// a slice containing the streams that are accepted (within the per-tenant
+// limits). Any streams that could not have their limits checked are also
+// accepted.
+func (l *ingestLimits) EnforceLimits(ctx context.Context, tenant string, streams []KeyedStream) ([]KeyedStream, error) {
+	results, err := l.ExceedsLimits(ctx, tenant, streams)
+	if err != nil {
+		return streams, err
 	}
+	// Fast path. No results means all streams were accepted and there were
+	// no failures, so we can return the input streams.
+	if len(results) == 0 {
+		return streams, nil
+	}
 	// We can do this without allocation if needed, but doing so will modify
 	// the original backing array. See "Filtering without allocation" from
 	// https://go.dev/wiki/SliceTricks.
-	withinLimits := make([]KeyedStream, 0, len(streams))
+	accepted := make([]KeyedStream, 0, len(streams))
```

Review comment on `accepted := make(...)`: Maybe we need to consider doing the "filtering without allocation" trick here, if other validation stats further down are updated by this. OTOH, the backing array includes shards and not streams, right? So the trick might not be possible at all. WDYT?
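For reference, the SliceTricks pattern the comment refers to looks roughly like this. It reuses the backing array of `streams` instead of allocating `accepted`, which is why the caller must not read the original slice afterwards; `isAccepted` is a hypothetical predicate standing in for the limits check:

```go
// Filtering without allocation: accepted shares streams' backing array, so
// appending to it overwrites the original elements in place.
accepted := streams[:0]
for _, s := range streams {
	if isAccepted(s) { // hypothetical stand-in for the per-stream limits check
		accepted = append(accepted, s)
	}
}
```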
```diff
 	for _, s := range streams {
-		if _, ok := reasons[s.HashKeyNoShard]; !ok {
-			withinLimits = append(withinLimits, s)
+		// Check each stream to see if it failed.
+		// TODO(grobinson): We have an O(N*M) loop here. Need to benchmark if
+		// it's faster to do this or if we should create a map instead.
+		var (
+			found  bool
+			reason uint32
+		)
+		for _, res := range results {
+			if res.StreamHash == s.HashKeyNoShard {
+				found = true
+				reason = res.Reason
+				break
+			}
+		}
```
Review comment on lines +119 to +131: We should measure whether this adds any latency in case of ingest-limits degradation.
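A benchmark along these lines could answer that question. The sketch below assumes it lives in the same package and reuses the hypothetical `fakeFrontendClient` shown earlier, simulating a degraded frontend that returns a `ReasonFailed` result for every stream:

```go
func BenchmarkEnforceLimitsDegraded(b *testing.B) {
	// Build N streams and a matching result per stream, all marked as failed
	// checks, to exercise the O(N*M) scan in its worst case.
	streams := make([]KeyedStream, 10000)
	results := make([]*proto.ExceedsLimitsResult, len(streams))
	for i := range streams {
		streams[i] = KeyedStream{HashKeyNoShard: uint64(i)} // illustrative construction
		results[i] = &proto.ExceedsLimitsResult{
			StreamHash: uint64(i),
			Reason:     uint32(limits.ReasonFailed),
		}
	}
	l := newIngestLimits(
		&fakeFrontendClient{resp: &proto.ExceedsLimitsResponse{Results: results}},
		prometheus.NewRegistry(),
	)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if _, err := l.EnforceLimits(context.Background(), "tenant", streams); err != nil {
			b.Fatal(err)
		}
	}
}
```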
```diff
+		if !found || reason == uint32(limits.ReasonFailed) {
+			accepted = append(accepted, s)
 		}
 	}
-	return withinLimits, reasons, nil
+	return accepted, nil
 }
```

Review comment on `if !found || reason == uint32(limits.ReasonFailed)`: Since we stopped using …
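For the TODO above, the map-based alternative would replace the inner scan with a single O(M) index build followed by O(1) lookups. A minimal sketch of the loop body under the same package-level types, preserving the fail-open handling of `ReasonFailed`:

```go
// Build the index once: stream hash -> rejection (or failure) reason.
reasonFor := make(map[uint64]uint32, len(results))
for _, res := range results {
	reasonFor[res.StreamHash] = res.Reason
}
accepted := make([]KeyedStream, 0, len(streams))
for _, s := range streams {
	reason, found := reasonFor[s.HashKeyNoShard]
	// Accept streams with no result, or whose check failed (fail open).
	if !found || reason == uint32(limits.ReasonFailed) {
		accepted = append(accepted, s)
	}
}
```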
```diff
-// ExceedsLimits returns true if one or more streams exceeds per-tenant limits,
-// and false if no streams exceed per-tenant limits. In the case where one or
-// more streams exceeds per-tenant limits, it returns the reasons for each stream.
-// An error is returned if per-tenant limits could not be checked.
-func (l *ingestLimits) exceedsLimits(ctx context.Context, tenant string, streams []KeyedStream) (bool, map[uint64][]string, error) {
+// ExceedsLimits checks all streams against the per-tenant limits. It returns
+// an error if the client failed to send the request or receive a response
+// from the server. Any streams that could not have their limits checked
+// are returned in the results with the reason "ReasonFailed".
+func (l *ingestLimits) ExceedsLimits(
+	ctx context.Context,
+	tenant string,
+	streams []KeyedStream,
+) ([]*proto.ExceedsLimitsResult, error) {
+	l.requests.Inc()
 	req, err := newExceedsLimitsRequest(tenant, streams)
 	if err != nil {
-		return false, nil, err
+		l.requestsFailed.Inc()
+		return nil, err
 	}
-	resp, err := l.client.exceedsLimits(ctx, req)
+	resp, err := l.client.ExceedsLimits(ctx, req)
 	if err != nil {
-		return false, nil, err
-	}
-	if len(resp.Results) == 0 {
-		return false, nil, nil
+		l.requestsFailed.Inc()
+		return nil, err
 	}
-	reasonsForHashes := make(map[uint64][]string)
-	for _, result := range resp.Results {
-		reasons := reasonsForHashes[result.StreamHash]
-		humanized := limits.Reason(result.Reason).String()
-		reasons = append(reasons, humanized)
-		reasonsForHashes[result.StreamHash] = reasons
-	}
-	return true, reasonsForHashes, nil
+	return resp.Results, nil
 }
```
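To make the fail-open contract concrete, a caller distinguishes the two failure modes roughly like this; a hedged sketch that mirrors how `EnforceLimits` above interprets the results rather than prescribing new behavior:

```go
results, err := l.ExceedsLimits(ctx, tenant, streams)
if err != nil {
	// Transport-level failure: no results at all, so callers such as
	// EnforceLimits fail open and keep every stream.
}
for _, res := range results {
	if res.Reason == uint32(limits.ReasonFailed) {
		// Per-stream failure: this stream could not be checked, so it is
		// accepted rather than rejected.
	}
}
```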
```diff
@@ -156,10 +177,3 @@ func newExceedsLimitsRequest(tenant string, streams []KeyedStream) (*proto.ExceedsLimitsRequest, error) {
 		Streams: streamMetadata,
 	}, nil
 }
-
-func firstReasonForHashes(reasonsForHashes map[uint64][]string) string {
-	for _, reasons := range reasonsForHashes {
-		return reasons[0]
-	}
-	return "unknown reason"
-}
```
Review comment: Got rid of these log lines; they would be far too much volume.