Spaces:
Runtime error
Runtime error
Sync from GitHub via hub-sync
Browse files- evals/run_eval.py +35 -39
evals/run_eval.py
CHANGED
|
@@ -423,18 +423,14 @@ def maybe_write_report(report):
|
|
| 423 |
return str(target)
|
| 424 |
|
| 425 |
|
| 426 |
-
def
|
| 427 |
-
|
| 428 |
from langchain_core.outputs import Generation, LLMResult
|
| 429 |
from ragas.llms.base import BaseRagasLLM
|
| 430 |
|
| 431 |
-
class
|
| 432 |
-
def __init__(self, model: str,
|
| 433 |
-
self.client =
|
| 434 |
-
vertexai=True,
|
| 435 |
-
project=project,
|
| 436 |
-
location=location,
|
| 437 |
-
)
|
| 438 |
self.model = model
|
| 439 |
self.set_run_config(run_config)
|
| 440 |
|
|
@@ -449,35 +445,35 @@ def build_vertex_ragas_llm(run_config):
|
|
| 449 |
|
| 450 |
def _generate_once(self, prompt, n=1, temperature=1e-8, stop=None, callbacks=None):
|
| 451 |
prompt_text = self._prompt_to_text(prompt)
|
| 452 |
-
|
| 453 |
"temperature": 0.0,
|
| 454 |
-
"
|
| 455 |
-
"max_output_tokens": int(os.getenv("EVAL_MAX_OUTPUT_TOKENS", "2048")),
|
| 456 |
-
"response_mime_type": "application/json",
|
| 457 |
}
|
| 458 |
if stop:
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
response = self.client.
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
)
|
| 466 |
|
| 467 |
-
candidates = getattr(response, "candidates", None) or []
|
| 468 |
generations = []
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
elif getattr(response, "text", None):
|
| 477 |
-
generations.append(Generation(text=response.text.strip()))
|
| 478 |
|
| 479 |
if not generations:
|
| 480 |
-
raise RuntimeError("
|
| 481 |
|
| 482 |
return LLMResult(generations=[generations])
|
| 483 |
|
|
@@ -500,12 +496,12 @@ def build_vertex_ragas_llm(run_config):
|
|
| 500 |
callbacks,
|
| 501 |
)
|
| 502 |
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
return
|
| 509 |
|
| 510 |
|
| 511 |
def build_ragas_embeddings(run_config):
|
|
@@ -572,14 +568,14 @@ def run_ragas(rows, outputs):
|
|
| 572 |
max_wait=int(os.getenv("EVAL_MAX_WAIT_SECONDS", "60")),
|
| 573 |
)
|
| 574 |
log(
|
| 575 |
-
"Using
|
| 576 |
-
f"({os.getenv('EVAL_MODEL', os.getenv('
|
| 577 |
)
|
| 578 |
log(
|
| 579 |
f"RAGAS runtime: async={RAGAS_ASYNC}, raise_exceptions={RAGAS_RAISE_EXCEPTIONS}, "
|
| 580 |
f"timeout={timeout_seconds}s, thread_timeout={thread_timeout_seconds}s, max_workers={max_workers}"
|
| 581 |
)
|
| 582 |
-
llm =
|
| 583 |
embeddings = build_ragas_embeddings(run_config)
|
| 584 |
ragas_report = evaluate(
|
| 585 |
build_ragas_dataset(),
|
|
|
|
| 423 |
return str(target)
|
| 424 |
|
| 425 |
|
| 426 |
+
def build_bedrock_ragas_llm(run_config):
|
| 427 |
+
import boto3
|
| 428 |
from langchain_core.outputs import Generation, LLMResult
|
| 429 |
from ragas.llms.base import BaseRagasLLM
|
| 430 |
|
| 431 |
+
class BedrockRagasLLM(BaseRagasLLM):
|
| 432 |
+
def __init__(self, model: str, region: str, run_config):
    """Build the Bedrock runtime client and register the RAGAS run config.

    model: Bedrock judge model id passed through to later Converse calls.
    region: AWS region name for the bedrock-runtime client.
    run_config: RAGAS RunConfig, forwarded to ``set_run_config``.
    """
    # Remember the judge model id first; client construction does the I/O setup.
    self.model = model
    # NOTE: ``boto3`` is imported in the enclosing build_bedrock_ragas_llm scope.
    self.client = boto3.client("bedrock-runtime", region_name=region)
    self.set_run_config(run_config)
|
| 436 |
|
|
|
|
| 445 |
|
| 446 |
def _generate_once(self, prompt, n=1, temperature=1e-8, stop=None, callbacks=None):
    """Issue a single Bedrock Converse request and wrap the reply as an LLMResult.

    The request is pinned to temperature 0.0 (deterministic judge); the
    ``n``, ``temperature`` and ``callbacks`` parameters are accepted only for
    interface compatibility and are not forwarded to the API.

    Raises:
        RuntimeError: if the model response contains no text at all.
    """
    request_text = self._prompt_to_text(prompt)

    # Deterministic judge settings; the output budget is env-tunable.
    config = {
        "temperature": 0.0,
        "maxTokens": int(os.getenv("EVAL_MAX_OUTPUT_TOKENS", "2048")),
    }
    if stop:
        config["stopSequences"] = stop

    user_message = {"role": "user", "content": [{"text": request_text}]}
    response = self.client.converse(
        modelId=self.model,
        messages=[user_message],
        inferenceConfig=config,
    )

    # Concatenate every text block of the reply message (defensively
    # tolerating missing keys / non-dict blocks in the response payload).
    message = (response.get("output") or {}).get("message") or {}
    blocks = message.get("content") or []
    pieces = [block.get("text", "") for block in blocks if isinstance(block, dict)]
    reply = "".join(pieces).strip()

    generations = [Generation(text=reply)] if reply else []
    if not generations:
        raise RuntimeError("AWS Bedrock judge returned an empty response.")

    # RAGAS expects a list of generation lists (one inner list per prompt).
    return LLMResult(generations=[generations])
|
| 479 |
|
|
|
|
| 496 |
callbacks,
|
| 497 |
)
|
| 498 |
|
| 499 |
+
region = os.getenv("AWS_REGION", os.getenv("AWS_DEFAULT_REGION", "us-east-1"))
|
| 500 |
+
model = os.getenv(
|
| 501 |
+
"EVAL_MODEL",
|
| 502 |
+
os.getenv("BEDROCK_EVAL_MODEL", "us.anthropic.claude-haiku-4-5-20251001"),
|
| 503 |
+
)
|
| 504 |
+
return BedrockRagasLLM(model=model, region=region, run_config=run_config)
|
| 505 |
|
| 506 |
|
| 507 |
def build_ragas_embeddings(run_config):
|
|
|
|
| 568 |
max_wait=int(os.getenv("EVAL_MAX_WAIT_SECONDS", "60")),
|
| 569 |
)
|
| 570 |
log(
|
| 571 |
+
"Using AWS Bedrock for RAGAS judge model "
|
| 572 |
+
f"({os.getenv('EVAL_MODEL', os.getenv('BEDROCK_EVAL_MODEL', 'us.anthropic.claude-haiku-4-5-20251001'))})"
|
| 573 |
)
|
| 574 |
log(
|
| 575 |
f"RAGAS runtime: async={RAGAS_ASYNC}, raise_exceptions={RAGAS_RAISE_EXCEPTIONS}, "
|
| 576 |
f"timeout={timeout_seconds}s, thread_timeout={thread_timeout_seconds}s, max_workers={max_workers}"
|
| 577 |
)
|
| 578 |
+
llm = build_bedrock_ragas_llm(run_config)
|
| 579 |
embeddings = build_ragas_embeddings(run_config)
|
| 580 |
ragas_report = evaluate(
|
| 581 |
build_ragas_dataset(),
|