Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/google/adk/evaluation/eval_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,19 @@ class BaseCriterion(BaseModel):
description="The threshold to be used by the metric.",
)

include_intermediate_responses_in_final: bool = Field(
default=False,
description=(
"Whether to evaluate the full agent response including intermediate"
" natural language text (e.g. text emitted before tool calls) in"
" addition to the final response. By default, only the final"
" response text is sent to the judge. When True, text from all"
" intermediate invocation events is concatenated with the final"
" response before evaluation. This is useful for agents that emit"
" text both before and after tool calls within a single invocation."
),
)


class LlmAsAJudgeCriterion(BaseCriterion):
"""Criterion when using LLM-As-A-Judge metric."""
Expand Down
34 changes: 33 additions & 1 deletion src/google/adk/evaluation/llm_as_judge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from .common import EvalBaseModel
from .eval_case import get_all_tool_calls_with_responses
from .eval_case import IntermediateDataType
from .eval_case import Invocation
from .eval_case import InvocationEvents
from .eval_metrics import RubricScore
from .evaluator import EvalStatus

Expand All @@ -44,8 +46,38 @@ class Label(enum.Enum):


def get_text_from_content(
content: Optional[genai_types.Content],
content: Optional[Union[genai_types.Content, Invocation]],
*,
include_intermediate_responses_in_final: bool = False,
) -> Optional[str]:
"""Extracts text from a `Content` or an `Invocation`.

When `content` is a `Content`, returns the concatenated text of its parts.

When `content` is an `Invocation`, returns the text of the invocation's final
response. If `include_intermediate_responses_in_final` is True, text from
intermediate invocation events (e.g. natural language emitted before tool
calls) is concatenated with the final response text.
"""
if isinstance(content, Invocation):
if not include_intermediate_responses_in_final:
# Flag off: revert to basic plain-Content behavior.
return get_text_from_content(content.final_response)

parts: list[str] = []
if isinstance(content.intermediate_data, InvocationEvents):
# Walk intermediate events in order; collect text parts.
for event in content.intermediate_data.invocation_events:
text = get_text_from_content(event.content)
if text:
parts.append(text)
# Then fetch the final response text and append it to the end.
final_text = get_text_from_content(content.final_response)
if final_text:
parts.append(final_text)

return "\n".join(parts) if parts else None

if content and content.parts:
return "\n".join([p.text for p in content.parts if p.text])

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,18 @@ def format_auto_rater_prompt(
"""Returns the autorater prompt."""
self.create_effective_rubrics_list(actual_invocation.rubrics)
user_input = get_text_from_content(actual_invocation.user_content)
final_response = get_text_from_content(actual_invocation.final_response)

criterion = self._eval_metric.criterion
include_intermediate = getattr(
criterion, "include_intermediate_responses_in_final", False
)
final_response = (
get_text_from_content(
actual_invocation,
include_intermediate_responses_in_final=include_intermediate,
)
or ""
)

rubrics_text = "\n".join([
f"* {r.rubric_content.text_property}"
Expand Down
44 changes: 44 additions & 0 deletions tests/unittests/evaluation/test_llm_as_judge_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from google.adk.evaluation.app_details import AgentDetails
from google.adk.evaluation.app_details import AppDetails
from google.adk.evaluation.eval_case import IntermediateData
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_case import InvocationEvent
from google.adk.evaluation.eval_case import InvocationEvents
from google.adk.evaluation.eval_rubrics import RubricScore
Expand Down Expand Up @@ -88,6 +89,49 @@ def test_get_text_from_content_with_mixed_parts():
assert get_text_from_content(content) == "Hello\nWorld"


def test_get_text_from_content_with_invocation_include_intermediate_responses_in_final():
  """Verifies Invocation handling in get_text_from_content with the flag on/off.

  With the flag off (default) only the final response text is extracted; with
  the flag on, intermediate natural-language text is prepended, while an
  event carrying only a function call contributes no text.
  """

  def _text_content(text):
    # Builder for a single text-part Content.
    return genai_types.Content(parts=[genai_types.Part(text=text)])

  pre_tool_text = "Let me check."
  final_text = "Done."
  tool_call_only = genai_types.Content(
      parts=[
          genai_types.Part(function_call=genai_types.FunctionCall(name="t"))
      ]
  )
  invocation = Invocation(
      user_content=_text_content("user"),
      intermediate_data=InvocationEvents(
          invocation_events=[
              InvocationEvent(
                  author="agent", content=_text_content(pre_tool_text)
              ),
              InvocationEvent(author="tool", content=tool_call_only),
          ]
      ),
      final_response=_text_content(final_text),
  )

  # Default (flag off): only the final response text is returned.
  assert get_text_from_content(invocation) == final_text

  # Flag on: intermediate text is joined before the final response with a
  # newline; the function-call-only event adds nothing.
  flagged_text = get_text_from_content(
      invocation, include_intermediate_responses_in_final=True
  )
  assert flagged_text == f"{pre_tool_text}\n{final_text}"


def test_get_eval_status_with_none_score():
  """Tests get_eval_status returns NOT_EVALUATED for a None score."""
  # A missing score must map to NOT_EVALUATED regardless of the threshold,
  # rather than being coerced to 0 and compared (which would yield FAILED).
  assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED
Expand Down
Loading