
Commit 93b8616

peterwald authored and stephentoub committed
Add reporting tests that show NLP results. (#6574)
* Add reporting tests that show NLP results.
* Cleanup analyzer errors.
* Add global tags for NLP
* Add more precision to the evaluator timing
* More tags
* Add another partial match test
1 parent 1a35757 commit 93b8616

6 files changed: +167 −4 lines changed


src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/BLEUEvaluator.cs

Lines changed: 1 addition & 1 deletion

@@ -86,7 +86,7 @@ public ValueTask<EvaluationResult> EvaluateAsync(
         });

         metric.Value = score;
-        string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
+        string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s";
         metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText);
         metric.AddOrUpdateContext(context);
         metric.Interpretation = metric.Interpret();
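The same one-character change appears in BLEUEvaluator.cs, F1Evaluator.cs, GLEUEvaluator.cs, and EvaluationMetricExtensions.cs: the evaluation duration recorded in metric metadata is now formatted with four decimal places instead of two. A minimal standalone sketch of the effect (not part of the commit; the 37.5 ms duration is just an illustrative value):

using System;
using System.Globalization;

// Hypothetical sub-second evaluation duration, used only for illustration.
TimeSpan duration = TimeSpan.FromMilliseconds(37.5);

// Old format specifier ("F2") rounds to hundredths of a second: "0.04 s".
string before = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";

// New format specifier ("F4") keeps ten-thousandths of a second: "0.0375 s".
string after = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s";

Console.WriteLine($"{before} -> {after}");

Since the NLP evaluators run locally (no model call), their durations are typically far below a hundredth of a second, which is why the extra precision matters for the reports.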

src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/F1Evaluator.cs

Lines changed: 1 addition & 1 deletion

@@ -77,7 +77,7 @@ public ValueTask<EvaluationResult> EvaluateAsync(
         });

         metric.Value = score;
-        string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
+        string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s";
         metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText);
         metric.AddOrUpdateContext(context);
         metric.Interpretation = metric.Interpret();

src/Libraries/Microsoft.Extensions.AI.Evaluation.NLP/GLEUEvaluator.cs

Lines changed: 1 addition & 1 deletion

@@ -86,7 +86,7 @@ public ValueTask<EvaluationResult> EvaluateAsync(
         });

         metric.Value = score;
-        string durationText = $"{duration.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
+        string durationText = $"{duration.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s";
         metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText);
         metric.AddOrUpdateContext(context);
         metric.Interpretation = metric.Interpret();

src/Libraries/Microsoft.Extensions.AI.Evaluation/EvaluationMetricExtensions.cs

Lines changed: 1 addition & 1 deletion

@@ -177,7 +177,7 @@ public static void AddOrUpdateChatMetadata(

         if (duration is not null)
         {
-            string durationText = $"{duration.Value.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
+            string durationText = $"{duration.Value.TotalSeconds.ToString("F4", CultureInfo.InvariantCulture)} s";
             metric.AddOrUpdateMetadata(name: "evaluation-duration", value: durationText);
         }
     }

test/Libraries/Microsoft.Extensions.AI.Evaluation.Integration.Tests/Microsoft.Extensions.AI.Evaluation.Integration.Tests.csproj

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@
   </ItemGroup>

   <ItemGroup>
+    <ProjectReference Include="..\..\..\src\Libraries\Microsoft.Extensions.AI.Evaluation.NLP\Microsoft.Extensions.AI.Evaluation.NLP.csproj" />
     <ProjectReference Include="..\..\..\src\Libraries\Microsoft.Extensions.AI.OpenAI\Microsoft.Extensions.AI.OpenAI.csproj" />
     <ProjectReference Include="..\..\..\src\Libraries\Microsoft.Extensions.AI.Evaluation\Microsoft.Extensions.AI.Evaluation.csproj" />
     <ProjectReference Include="..\..\..\src\Libraries\Microsoft.Extensions.AI.Evaluation.Quality\Microsoft.Extensions.AI.Evaluation.Quality.csproj" />
Lines changed: 162 additions & 0 deletions

@@ -0,0 +1,162 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#pragma warning disable CA2016 // Forward the 'CancellationToken' parameter to methods that take it.
#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor.

using System;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.AI.Evaluation.NLP;
using Microsoft.Extensions.AI.Evaluation.Reporting;
using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;
using Microsoft.TestUtilities;
using Xunit;

namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests;

[Experimental("AIEVAL001")]
public class NLPEvaluatorTests
{
    private static readonly ReportingConfiguration? _nlpReportingConfiguration;

    static NLPEvaluatorTests()
    {
        if (Settings.Current.Configured)
        {
            string version = $"Product Version: {Constants.Version}";
            string date = $"Date: {DateTime.UtcNow:dddd, dd MMMM yyyy}";
            string projectName = $"Project: Integration Tests";
            string testClass = $"Test Class: {nameof(NLPEvaluatorTests)}";
            string usesContext = $"Feature: Context";

            IEvaluator bleuEvaluator = new BLEUEvaluator();
            IEvaluator gleuEvaluator = new GLEUEvaluator();
            IEvaluator f1Evaluator = new F1Evaluator();

            _nlpReportingConfiguration =
                DiskBasedReportingConfiguration.Create(
                    storageRootPath: Settings.Current.StorageRootPath,
                    evaluators: [bleuEvaluator, gleuEvaluator, f1Evaluator],
                    executionName: Constants.Version,
                    tags: [version, date, projectName, testClass, usesContext]);
        }
    }

    [ConditionalFact]
    public async Task ExactMatch()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(ExactMatch)}");

        var referenceText = "The quick brown fox jumps over the lazy dog.";
        var bleuContext = new BLEUEvaluatorContext(referenceText);
        var gleuContext = new GLEUEvaluatorContext(referenceText);
        var f1Context = new F1EvaluatorContext(referenceText);

        EvaluationResult result = await scenarioRun.EvaluateAsync(referenceText, [bleuContext, gleuContext, f1Context]);

        Assert.False(
            result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
            string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

        Assert.Equal(3, result.Metrics.Count);
        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _));
    }

    [ConditionalFact]
    public async Task PartialMatch()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(PartialMatch)}");

        var referenceText = "The quick brown fox jumps over the lazy dog.";
        var bleuContext = new BLEUEvaluatorContext(referenceText);
        var gleuContext = new GLEUEvaluatorContext(referenceText);
        var f1Context = new F1EvaluatorContext(referenceText);

        var similarText = "The brown fox quickly jumps over a lazy dog.";
        EvaluationResult result = await scenarioRun.EvaluateAsync(similarText, [bleuContext, gleuContext, f1Context]);

        Assert.False(
            result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
            string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

        Assert.Equal(3, result.Metrics.Count);
        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _));
    }

    [ConditionalFact]
    public async Task Unmatched()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(Unmatched)}");

        var referenceText = "The quick brown fox jumps over the lazy dog.";
        var bleuContext = new BLEUEvaluatorContext(referenceText);
        var gleuContext = new GLEUEvaluatorContext(referenceText);
        var f1Context = new F1EvaluatorContext(referenceText);

        EvaluationResult result = await scenarioRun.EvaluateAsync("What is life's meaning?", [bleuContext, gleuContext, f1Context]);

        Assert.False(
            result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
            string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

        Assert.Equal(3, result.Metrics.Count);
        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _));
    }

    [ConditionalFact]
    public async Task AdditionalContextIsNotPassed()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{nameof(AdditionalContextIsNotPassed)}");

        EvaluationResult result = await scenarioRun.EvaluateAsync("What is the meaning of life?");

        Assert.True(
            result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)),
            string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString())));

        Assert.Equal(3, result.Metrics.Count);
        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? bleu));
        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? gleu));
        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? f1));

        Assert.Null(bleu.Context);
        Assert.Null(gleu.Context);
        Assert.Null(f1.Context);

    }

    [MemberNotNull(nameof(_nlpReportingConfiguration))]
    private static void SkipIfNotConfigured()
    {
        if (!Settings.Current.Configured)
        {
            throw new SkipTestException("Test is not configured");
        }

        Assert.NotNull(_nlpReportingConfiguration);
    }
}
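Taken together, the new test class exercises one pattern: create a reporting configuration with the three NLP evaluators and global tags, open a scenario run, and evaluate a response against a reference text supplied through per-evaluator contexts. A condensed, standalone sketch of that pattern follows; it is not part of the commit, and the storage path, execution name, and scenario name are placeholders:

using System;
using Microsoft.Extensions.AI.Evaluation;
using Microsoft.Extensions.AI.Evaluation.NLP;
using Microsoft.Extensions.AI.Evaluation.Reporting;
using Microsoft.Extensions.AI.Evaluation.Reporting.Storage;

// The three NLP evaluators added to the reporting configuration.
IEvaluator[] evaluators = [new BLEUEvaluator(), new GLEUEvaluator(), new F1Evaluator()];

ReportingConfiguration config =
    DiskBasedReportingConfiguration.Create(
        storageRootPath: "/tmp/nlp-eval",   // placeholder path
        evaluators: evaluators,
        executionName: "local-run",         // placeholder execution name
        tags: ["Feature: Context"]);        // global tags, as in the test's static constructor

await using ScenarioRun scenarioRun =
    await config.CreateScenarioRunAsync(scenarioName: "NLPSketch"); // placeholder scenario name

// Each NLP evaluator compares the evaluated text against a reference passed via its context.
string referenceText = "The quick brown fox jumps over the lazy dog.";
EvaluationResult result =
    await scenarioRun.EvaluateAsync(
        "The brown fox quickly jumps over a lazy dog.",
        [new BLEUEvaluatorContext(referenceText),
         new GLEUEvaluatorContext(referenceText),
         new F1EvaluatorContext(referenceText)]);

Console.WriteLine(result.Metrics.Count); // 3: the BLEU, GLEU, and F1 metrics

As the AdditionalContextIsNotPassed test shows, omitting the contexts leaves the metrics with error diagnostics and null Context, since these evaluators have no reference text to score against.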
