feat: GenAI Evaluation: Release GenAI Evaluation SDK multimodal evaluation to vertexai.preview module.

vertex-sdk-bot · copybara-github · commit f090ca1f7538 · 2025-03-28T15:50:34.000-07:00
PiperOrigin-RevId: 741668994
diff --git a/tests/unit/vertexai/test_evaluation.py b/tests/unit/vertexai/test_evaluation.py
@@ -36,6 +36,7 @@
 from google.cloud.aiplatform_v1beta1.services import (
     evaluation_service as gapic_evaluation_services_preview,
 )
+from google.cloud.aiplatform_v1beta1.types import content
 from google.cloud.aiplatform_v1beta1.types import (
     evaluation_service as gapic_evaluation_service_types_preview,
 )
@@ -70,6 +71,10 @@
 PairwisePreview = (
     evaluation_preview.metrics.metric_prompt_template_examples.MetricPromptTemplateExamples.Pairwise
 )
+ContentMap = gapic_evaluation_service_types_preview.ContentMap
+Content = content.Content
+Part = content.Part
+
 
 _TEST_PROJECT = "test-project"
 _TEST_LOCATION = "us-central1"
diff --git a/tests/unit/vertexai/test_multimodal_utils.py b/tests/unit/vertexai/test_multimodal_utils.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Unit tests for multimodal utils."""
+
+from google.cloud.aiplatform_v1beta1.types import content
+from google.cloud.aiplatform_v1beta1.types import (
+    evaluation_service as gapic_eval_service_types,
+)
+from vertexai.preview.evaluation import (
+    multimodal_utils,
+)
+
+
+ContentMap = gapic_eval_service_types.ContentMap
+Content = content.Content
+Part = content.Part
+
+_TEST_PROJECT = "test-project"
+_TEST_LOCATION = "us-central1"
+
+_MODEL_BASED_METRIC_INSTANCE_INPUT = {  # Every value is a JSON string holding a "contents" list.
+    "prompt": '{"contents": [{"parts": [{"text": "test prompt"}]}]}',
+    "response": (
+        '{"contents": [{"parts": [{"file_data": {"mime_type": "image/png",'
+        ' "file_uri": "gs://test-bucket/image1.png"}}]}]}'
+    ),
+    "baseline_response": (  # NOTE(review): "image/jepg" looks like a typo for "image/jpeg"; the expected value in the test uses the same spelling.
+        '{"contents": [{"parts": [{"file_data": {"mime_type": "image/jepg",'
+        ' "file_uri": "gs://test-bucket/image2.png"}}]}]}'
+    ),
+}
+_INVALID_MODEL_BASED_METRIC_INSTANCE_INPUT = {  # No value here is well-formed contents JSON.
+    "prompt": "test prompt",
+    "invalid_response_format": (  # Deliberately malformed: note the stray "{{{{".
+        '{"contents": [{{{{"parts": [{"file_data": {"mime_type": "image/png",'
+        ' "file_uri": "gs://test-bucket/image1.png"}}]}]}'
+    ),
+    "baseline_response": "test image",
+}
+
+
+class TestMultimodalUtils:
+    """Unit tests for multimodal utils."""
+
+    def test_is_multimodal_instance(self):
+        assert multimodal_utils.is_multimodal_instance(
+            _MODEL_BASED_METRIC_INSTANCE_INPUT
+        )
+
+    def test_not_multimodal_instance(self):
+        assert not multimodal_utils.is_multimodal_instance(
+            _INVALID_MODEL_BASED_METRIC_INSTANCE_INPUT
+        )
+
+    def test_convert_multimodal_response_to_content_map(self):
+        """Test convert_multimodal_response_to_content_map."""
+        content_map = multimodal_utils.convert_multimodal_response_to_content_map(
+            _MODEL_BASED_METRIC_INSTANCE_INPUT
+        )
+        assert content_map.values["prompt"] == ContentMap.Contents(
+            contents=[Content(parts=[Part(text="test prompt")])]
+        )
+        assert content_map.values["response"] == ContentMap.Contents(
+            contents=[
+                Content(
+                    parts=[
+                        Part(
+                            file_data={
+                                "mime_type": "image/png",
+                                "file_uri": "gs://test-bucket/image1.png",
+                            }
+                        )
+                    ]
+                )
+            ]
+        )
+        assert content_map.values["baseline_response"] == ContentMap.Contents(
+            contents=[
+                Content(
+                    parts=[
+                        Part(
+                            file_data={
+                                "mime_type": "image/jepg",
+                                "file_uri": "gs://test-bucket/image2.png",
+                            }
+                        )
+                    ]
+                )
+            ]
+        )
diff --git a/vertexai/preview/evaluation/metrics/_instance_evaluation.py b/vertexai/preview/evaluation/metrics/_instance_evaluation.py
@@ -30,6 +30,9 @@
 )
 from vertexai.preview.evaluation import _base as eval_base
 from vertexai.preview.evaluation import constants
+from vertexai.preview.evaluation import (
+    multimodal_utils,
+)
 from vertexai.preview.evaluation import (
     prompt_template as prompt_template_base,
 )
@@ -46,7 +49,6 @@
 
 from google.protobuf import json_format
 
-
 _LOGGER = base.Logger(__name__)
 _METRIC_NAME_TO_METRIC_SPEC = {
     # Automatic Metrics.
@@ -317,24 +319,44 @@ def build_request(
             tool_parameter_kv_match_input=instance,
         )
     elif metric_name == constants.Metric.POINTWISE_METRIC:
-        instance = gapic_eval_service_types.PointwiseMetricInput(
-            metric_spec=metric_spec,
-            instance=gapic_eval_service_types.PointwiseMetricInstance(
-                json_instance=json.dumps(model_based_metric_instance_input),
-            ),
-        )
+        if multimodal_utils.is_multimodal_instance(model_based_metric_instance_input):
+            instance = gapic_eval_service_types.PointwiseMetricInput(
+                metric_spec=metric_spec,
+                instance=gapic_eval_service_types.PointwiseMetricInstance(
+                    content_map_instance=multimodal_utils.convert_multimodal_response_to_content_map(
+                        model_based_metric_instance_input
+                    ),
+                ),
+            )
+        else:
+            instance = gapic_eval_service_types.PointwiseMetricInput(
+                metric_spec=metric_spec,
+                instance=gapic_eval_service_types.PointwiseMetricInstance(
+                    json_instance=json.dumps(model_based_metric_instance_input),
+                ),
+            )
         return gapic_eval_service_types.EvaluateInstancesRequest(
             location=location_path,
             pointwise_metric_input=instance,
             autorater_config=evaluation_run_config.autorater_config,
         )
     elif metric_name == constants.Metric.PAIRWISE_METRIC:
-        instance = gapic_eval_service_types.PairwiseMetricInput(
-            metric_spec=metric_spec,
-            instance=gapic_eval_service_types.PairwiseMetricInstance(
-                json_instance=json.dumps(model_based_metric_instance_input),
-            ),
-        )
+        if multimodal_utils.is_multimodal_instance(model_based_metric_instance_input):
+            instance = gapic_eval_service_types.PairwiseMetricInput(
+                metric_spec=metric_spec,
+                instance=gapic_eval_service_types.PairwiseMetricInstance(
+                    content_map_instance=multimodal_utils.convert_multimodal_response_to_content_map(
+                        model_based_metric_instance_input
+                    ),
+                ),
+            )
+        else:
+            instance = gapic_eval_service_types.PairwiseMetricInput(
+                metric_spec=metric_spec,
+                instance=gapic_eval_service_types.PairwiseMetricInstance(
+                    json_instance=json.dumps(model_based_metric_instance_input),
+                ),
+            )
         return gapic_eval_service_types.EvaluateInstancesRequest(
             location=location_path,
             pairwise_metric_input=instance,
diff --git a/vertexai/preview/evaluation/multimodal_utils.py b/vertexai/preview/evaluation/multimodal_utils.py
@@ -0,0 +1,82 @@
+"""Utility functions for multimodal evaluation."""
+
+import logging
+from typing import Dict, Optional
+
+from google.cloud.aiplatform_v1beta1.types import content
+from google.cloud.aiplatform_v1beta1.types import (
+    evaluation_service as gapic_eval_service_types,
+)
+from google.protobuf import json_format
+
+
+ContentMap = gapic_eval_service_types.ContentMap
+Content = content.Content
+Part = content.Part
+_CONTENTS_DETECTOR = "contents {"  # Substring that gates the parse-failure warnings below.
+_PARTS_DETECTOR = "parts {"  # Must appear together with _CONTENTS_DETECTOR for a warning.
+
+
+def _string_to_content_list(input_str: str) -> ContentMap.Contents:
+    """Converts a string to a list if possible, otherwise returns None."""
+    try:
+        return json_format.Parse(
+            input_str,
+            ContentMap.Contents.pb(ContentMap.Contents()),
+        )
+    except json_format.ParseError as e:
+        if _CONTENTS_DETECTOR in input_str and _PARTS_DETECTOR in input_str:
+            logging.warning(
+                "Failed to parse %s to ContentMap.Contents: %s", input_str, e
+            )
+        return None
+
+
+def _is_multimodal_response(response: str) -> bool:
+    """Checks if the model response contains multimodal input."""
+    content_list = _string_to_content_list(response)
+    if content_list is None:
+        if _CONTENTS_DETECTOR in response and _PARTS_DETECTOR in response:
+            logging.warning(
+                "Response contains multimodal input: %s. Please check whether"
+                " the response format conforms to ContentMap type.",
+                response,
+            )
+        return False
+    else:
+        return True
+
+
+def is_multimodal_instance(
+    model_based_metric_instance_input: Dict[str, str],
+) -> bool:
+    """Checks if the evaluation instance contains multimodal input."""
+    for placeholder in model_based_metric_instance_input:
+        if _is_multimodal_response(model_based_metric_instance_input[placeholder]):
+            return True
+    return False
+
+
+def convert_multimodal_response_to_content_map(
+    model_based_metric_instance_input: Dict[str, str],
+) -> ContentMap:
+    """Converts a multimodal model response to a ContentMap."""
+    content_map = ContentMap()
+    for placeholder in model_based_metric_instance_input.keys():
+        content_list = _string_to_content_list(
+            model_based_metric_instance_input[placeholder]
+        )
+        if content_list is None:
+            content_map.values[placeholder] = ContentMap.Contents(
+                contents=[
+                    Content(
+                        parts=[
+                            Part(text=model_based_metric_instance_input[placeholder])
+                        ]
+                    )
+                ]
+            )
+        else:
+            content_map.values[placeholder] = content_list
+
+    return content_map

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,7 @@`
`36`	`36`	`from google.cloud.aiplatform_v1beta1.services import (`
`37`	`37`	`evaluation_service as gapic_evaluation_services_preview,`
`38`	`38`	`)`
	`39`	`+from google.cloud.aiplatform_v1beta1.types import content`
`39`	`40`	`from google.cloud.aiplatform_v1beta1.types import (`
`40`	`41`	`evaluation_service as gapic_evaluation_service_types_preview,`
`41`	`42`	`)`
`@@ -70,6 +71,10 @@`
`70`	`71`	`PairwisePreview = (`
`71`	`72`	`evaluation_preview.metrics.metric_prompt_template_examples.MetricPromptTemplateExamples.Pairwise`
`72`	`73`	`)`
	`74`	`+ContentMap = gapic_evaluation_service_types_preview.ContentMap`
	`75`	`+Content = content.Content`
	`76`	`+Part = content.Part`
	`77`	`+`
`73`	`78`
`74`	`79`	`_TEST_PROJECT = "test-project"`
`75`	`80`	`_TEST_LOCATION = "us-central1"`