|
| 1 | +// Licensed to the .NET Foundation under one or more agreements. |
| 2 | +// The .NET Foundation licenses this file to you under the MIT license. |
| 3 | + |
| 4 | +#pragma warning disable CA2016 // Forward the 'CancellationToken' parameter to methods that take it. |
| 5 | +#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. |
| 6 | + |
| 7 | +using System; |
| 8 | +using System.Diagnostics.CodeAnalysis; |
| 9 | +using System.Linq; |
| 10 | +using System.Threading.Tasks; |
| 11 | +using Microsoft.Extensions.AI.Evaluation.NLP; |
| 12 | +using Microsoft.Extensions.AI.Evaluation.Reporting; |
| 13 | +using Microsoft.Extensions.AI.Evaluation.Reporting.Storage; |
| 14 | +using Microsoft.TestUtilities; |
| 15 | +using Xunit; |
| 16 | + |
| 17 | +namespace Microsoft.Extensions.AI.Evaluation.Integration.Tests; |
| 18 | + |
/// <summary>
/// Integration tests that run the BLEU, GLEU and F1 evaluators together through a disk-based
/// <see cref="ReportingConfiguration"/>.
/// </summary>
/// <remarks>
/// Each test calls <see cref="SkipIfNotConfigured"/> first, so the test is skipped when
/// <c>Settings.Current.Configured</c> is <see langword="false"/>.
/// </remarks>
[Experimental("AIEVAL001")]
public class NLPEvaluatorTests
{
    /// <summary>
    /// Reference sentence that every context-based test compares the model response against.
    /// </summary>
    private const string ReferenceText = "The quick brown fox jumps over the lazy dog.";

    // Remains null when the test settings are not configured; SkipIfNotConfigured() guards every use.
    private static readonly ReportingConfiguration? _nlpReportingConfiguration;

    static NLPEvaluatorTests()
    {
        if (!Settings.Current.Configured)
        {
            // Nothing to set up: every test skips itself in this case.
            return;
        }

        // Tags attached to the reporting configuration created below.
        string version = $"Product Version: {Constants.Version}";
        string date = $"Date: {DateTime.UtcNow:dddd, dd MMMM yyyy}";
        string projectName = "Project: Integration Tests";
        string testClass = $"Test Class: {nameof(NLPEvaluatorTests)}";
        string usesContext = "Feature: Context";

        IEvaluator bleuEvaluator = new BLEUEvaluator();
        IEvaluator gleuEvaluator = new GLEUEvaluator();
        IEvaluator f1Evaluator = new F1Evaluator();

        _nlpReportingConfiguration =
            DiskBasedReportingConfiguration.Create(
                storageRootPath: Settings.Current.StorageRootPath,
                evaluators: [bleuEvaluator, gleuEvaluator, f1Evaluator],
                executionName: Constants.Version,
                tags: [version, date, projectName, testClass, usesContext]);
    }

    /// <summary>
    /// Evaluates a response identical to the reference text and expects all three metrics with no
    /// warning or error diagnostics.
    /// </summary>
    /// <returns>A task that completes when the test has finished.</returns>
    [ConditionalFact]
    public async Task ExactMatch()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: GetScenarioName(nameof(ExactMatch)));

        EvaluationResult result = await scenarioRun.EvaluateAsync(ReferenceText, CreateNlpContexts(ReferenceText));

        AssertNoWarningsOrErrors(result);
        AssertContainsAllNlpMetrics(result);
    }

    /// <summary>
    /// Evaluates a response that only partially overlaps the reference text and expects all three
    /// metrics with no warning or error diagnostics.
    /// </summary>
    /// <returns>A task that completes when the test has finished.</returns>
    [ConditionalFact]
    public async Task PartialMatch()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: GetScenarioName(nameof(PartialMatch)));

        var similarText = "The brown fox quickly jumps over a lazy dog.";
        EvaluationResult result = await scenarioRun.EvaluateAsync(similarText, CreateNlpContexts(ReferenceText));

        AssertNoWarningsOrErrors(result);
        AssertContainsAllNlpMetrics(result);
    }

    /// <summary>
    /// Evaluates a response unrelated to the reference text and expects all three metrics with no
    /// warning or error diagnostics.
    /// </summary>
    /// <returns>A task that completes when the test has finished.</returns>
    [ConditionalFact]
    public async Task Unmatched()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: GetScenarioName(nameof(Unmatched)));

        EvaluationResult result = await scenarioRun.EvaluateAsync("What is life's meaning?", CreateNlpContexts(ReferenceText));

        AssertNoWarningsOrErrors(result);
        AssertContainsAllNlpMetrics(result);
    }

    /// <summary>
    /// Evaluates a response without supplying any evaluator context and expects every metric to be
    /// present, to carry an error diagnostic and to have a null context.
    /// </summary>
    /// <returns>A task that completes when the test has finished.</returns>
    [ConditionalFact]
    public async Task AdditionalContextIsNotPassed()
    {
        SkipIfNotConfigured();

        await using ScenarioRun scenarioRun =
            await _nlpReportingConfiguration.CreateScenarioRunAsync(
                scenarioName: GetScenarioName(nameof(AdditionalContextIsNotPassed)));

        EvaluationResult result = await scenarioRun.EvaluateAsync("What is the meaning of life?");

        Assert.True(
            result.Metrics.Values.All(m => m.ContainsDiagnostics(d => d.Severity is EvaluationDiagnosticSeverity.Error)),
            DescribeDiagnostics(result));

        Assert.Equal(3, result.Metrics.Count);
        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? bleu));
        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? gleu));
        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? f1));

        Assert.Null(bleu.Context);
        Assert.Null(gleu.Context);
        Assert.Null(f1.Context);
    }

    /// <summary>
    /// Builds the fully qualified scenario name for a test method of this class.
    /// </summary>
    /// <param name="testName">The name of the test method.</param>
    /// <returns>The namespace-qualified and class-qualified scenario name.</returns>
    private static string GetScenarioName(string testName) =>
        $"Microsoft.Extensions.AI.Evaluation.Integration.Tests.{nameof(NLPEvaluatorTests)}.{testName}";

    /// <summary>
    /// Creates one context per NLP evaluator (BLEU, GLEU and F1), all using the same reference text.
    /// </summary>
    /// <param name="referenceText">The reference text handed to each context.</param>
    /// <returns>The three contexts, in BLEU, GLEU, F1 order.</returns>
    private static EvaluationContext[] CreateNlpContexts(string referenceText) =>
        [
            new BLEUEvaluatorContext(referenceText),
            new GLEUEvaluatorContext(referenceText),
            new F1EvaluatorContext(referenceText),
        ];

    /// <summary>
    /// Joins the string form of every diagnostic on every metric in <paramref name="result"/>, for
    /// use as an assertion failure message.
    /// </summary>
    /// <param name="result">The evaluation result whose diagnostics are collected.</param>
    /// <returns>All diagnostics, separated by blank lines.</returns>
    private static string DescribeDiagnostics(EvaluationResult result) =>
        string.Join("\r\n\r\n", result.Metrics.Values.SelectMany(m => m.Diagnostics ?? []).Select(d => d.ToString()));

    /// <summary>
    /// Asserts that <paramref name="result"/> contains no diagnostic of severity warning or higher.
    /// </summary>
    /// <param name="result">The evaluation result to inspect.</param>
    private static void AssertNoWarningsOrErrors(EvaluationResult result) =>
        Assert.False(
            result.ContainsDiagnostics(d => d.Severity >= EvaluationDiagnosticSeverity.Warning),
            DescribeDiagnostics(result));

    /// <summary>
    /// Asserts that <paramref name="result"/> holds exactly three metrics and that the BLEU, GLEU
    /// and F1 metrics can each be retrieved as a <see cref="NumericMetric"/>.
    /// </summary>
    /// <param name="result">The evaluation result to inspect.</param>
    private static void AssertContainsAllNlpMetrics(EvaluationResult result)
    {
        Assert.Equal(3, result.Metrics.Count);
        Assert.True(result.TryGet(BLEUEvaluator.BLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(GLEUEvaluator.GLEUMetricName, out NumericMetric? _));
        Assert.True(result.TryGet(F1Evaluator.F1MetricName, out NumericMetric? _));
    }

    /// <summary>
    /// Skips the calling test when the test settings are not configured; otherwise asserts that the
    /// reporting configuration was created by the static constructor.
    /// </summary>
    [MemberNotNull(nameof(_nlpReportingConfiguration))]
    private static void SkipIfNotConfigured()
    {
        if (!Settings.Current.Configured)
        {
            throw new SkipTestException("Test is not configured");
        }

        Assert.NotNull(_nlpReportingConfiguration);
    }
}
0 commit comments