		Help: "The total number of streams with exporter=OTLP label",
	}, []string{"tenant"})

+	distributorLagByUserAgent = promauto.NewCounterVec(prometheus.CounterOpts{
+		Namespace: constants.Loki,
+		Name:      "distributor_most_recent_lag_ms",
+		Help:      "The difference in time (in millis) between when a distributor receives a push request and the most recent log timestamp in that request",
+	}, []string{"tenant", "userAgent"})
+
	bytesReceivedStats                   = analytics.NewCounter("distributor_bytes_received")
	structuredMetadataBytesReceivedStats = analytics.NewCounter("distributor_structured_metadata_bytes_received")
	linesReceivedStats                   = analytics.NewCounter("distributor_lines_received")
@@ -221,6 +227,7 @@ func ParseRequest(logger log.Logger, userID string, maxRecvMsgSize int, r *http.
		totalNumLines += numLines
	}
	linesReceivedStats.Inc(totalNumLines)
+	mostRecentLagMs := time.Since(pushStats.MostRecentEntryTimestamp).Milliseconds()

	logValues := []interface{}{
		"msg", "push request parsed",
@@ -234,7 +241,7 @@ func ParseRequest(logger log.Logger, userID string, maxRecvMsgSize int, r *http.
		"entriesSize", humanize.Bytes(uint64(entriesSize)),
		"structuredMetadataSize", humanize.Bytes(uint64(structuredMetadataSize)),
		"totalSize", humanize.Bytes(uint64(entriesSize+pushStats.StreamLabelsSize)),
-		"mostRecentLagMs", time.Since(pushStats.MostRecentEntryTimestamp).Milliseconds(),
+		"mostRecentLagMs", mostRecentLagMs,
	}

	if presumedAgentIP != "" {
@@ -245,6 +252,18 @@ func ParseRequest(logger log.Logger, userID string, maxRecvMsgSize int, r *http.
	if userAgent != "" {
		logValues = append(logValues, "userAgent", strings.TrimSpace(userAgent))
	}
+	// Since we're using a counter (so we can do things w/ rate, irate, deriv, etc.) for the lag metrics,
+	// skip the sample whenever we get a negative value. This could occur if we start getting logs
+	// whose timestamps are in the future (e.g. agents sending logs with missing or invalid NTP configs).
+	// Negative values can't give us much insight into whether or not a customer's ingestion is falling
+	// behind, so we won't include them in the metrics and instead will capture the occurrence in the
+	// distributor logs (the log line above already records mostRecentLagMs).
+	// We record this metric even when the user agent is empty; we want insight into the tenant's
+	// ingestion lag no matter what.
+	if mostRecentLagMs >= 0 && mostRecentLagMs < 1_000_000_000 {
+		// We're filtering out anything over 1B ms -- the OTLP endpoints often really mess with this metric.
+		distributorLagByUserAgent.WithLabelValues(userID, userAgent).Add(float64(mostRecentLagMs))
+	}

	if tenantConfigs != nil && tenantConfigs.LogHashOfLabels(userID) {
		resultHash := uint64(0)
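
As the comment in the diff notes, the metric is a plain counter of accumulated milliseconds rather than a gauge, so queries are expected to apply rate(), irate(), or deriv() to it downstream. Below is a minimal, self-contained Go sketch of how such a counter behaves under the same guard; it is not part of this change, and the tenant value, user-agent value, and simulated timestamp are illustrative assumptions.

// Sketch: accumulate lag samples into a counter vec and read the value back.
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// Stand-in for distributorLagByUserAgent; prometheus.NewCounterVec is used
	// instead of promauto so this example doesn't touch the default registry.
	lagByUserAgent := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "loki",
		Name:      "distributor_most_recent_lag_ms",
		Help:      "Accumulated lag (ms) between request receipt and the most recent log timestamp.",
	}, []string{"tenant", "userAgent"})

	// Simulate a push request whose newest entry is 1.5s behind wall-clock time.
	mostRecentEntry := time.Now().Add(-1500 * time.Millisecond)
	mostRecentLagMs := time.Since(mostRecentEntry).Milliseconds()

	// Same guard as the diff: drop negative lag (future timestamps) and
	// implausibly large values instead of polluting the counter.
	if mostRecentLagMs >= 0 && mostRecentLagMs < 1_000_000_000 {
		lagByUserAgent.WithLabelValues("tenant-a", "promtail/2.9.0").Add(float64(mostRecentLagMs))
	}

	// testutil.ToFloat64 reads the accumulated value back, e.g. in a unit test.
	fmt.Printf("accumulated lag ms: %.0f\n",
		testutil.ToFloat64(lagByUserAgent.WithLabelValues("tenant-a", "promtail/2.9.0")))
}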