diff --git a/backend/app/evaluation/README.md b/backend/app/evaluation/README.md
index 17ebc1c..60b497b 100644
--- a/backend/app/evaluation/README.md
+++ b/backend/app/evaluation/README.md
@@ -48,7 +48,8 @@ backend/app/evaluation/
 │   └── run.py                # PII evaluation script
 ├── topic_relevance/
 │   └── run.py                # Topic relevance evaluation script
-└── toxicity/                 # Toxicity evaluation scripts
+└── toxicity/
+    └── run.py                # Toxicity evaluation script (LlamaGuard7B, NSFWText, ProfanityFree)
 ```
 
 ## Prerequisites
@@ -91,7 +92,7 @@ Validators that use LLM-as-judge approach will require credentials for LLM provi
 
 ## Running All Evaluations
 
-To run all individual validator evaluations in sequence (lexical slur, PII, gender assumption bias, ban list, topic relevance):
+To run all individual validator evaluations in sequence (lexical slur, PII, gender assumption bias, ban list, topic relevance, toxicity):
 
 ```bash
 bash scripts/run_all_evaluations.sh
@@ -262,6 +263,49 @@ python3 app/evaluation/topic_relevance/run.py
 
 ---
 
+### Toxicity (`llamaguard_7b`, `nsfw_text`, `profanity_free`)
+
+**Script:** `app/evaluation/toxicity/run.py`
+
+**Datasets:**
+- `datasets/toxicity/toxicity_test_hasoc.csv`
+- `datasets/toxicity/toxicity_test_sharechat.csv`
+
+Expected columns — HASOC dataset:
+
+- `text` — tweet/comment text to validate
+- `task1` — ground truth label (`HOF` = hate/offensive/profanity → `1`, `NOT` → `0`)
+- `lang` — language code (informational)
+
+Expected columns — ShareChat dataset:
+
+- `commentText` — comment text to validate
+- `label` — binary ground truth (`1` = toxic, `0` = not toxic)
+- `language` — language label (informational)
+
+**What it does:** Runs three validators — `LlamaGuard7B`, `NSFWText`, and `ProfanityFree` — across both datasets independently. For each validator, a binary prediction is recorded (`1` if `FailResult`, `0` otherwise) and compared against the ground truth label to compute classification metrics.
+
+**Output per dataset:**
+
+```
+outputs/toxicity/predictions_hasoc.csv
+outputs/toxicity/metrics_hasoc.json
+outputs/toxicity/predictions_sharechat.csv
+outputs/toxicity/metrics_sharechat.json
+```
+
+Each predictions CSV contains the source text, ground truth (`y_true`), and one `*_pred` column per validator. Each metrics JSON contains accuracy, precision, recall, F1, and performance stats broken down per validator.
+
+**Run:**
+
+```bash
+python3 app/evaluation/toxicity/run.py
+```
+
+> **Note:** `LlamaGuard7B` uses remote inferencing — requires a valid `GUARDRAILS_HUB_API_KEY` and internet access. `NSFWText` downloads the `textdetox/xlmr-large-toxicity-classifier` model on first run.
+
+---
+
 ## Multiple Validators Evaluation (End-to-End)
 
 This evaluation runs multiple validators **together** against a dataset via the live guardrails API. Unlike the individual evaluations above, this is an **end-to-end integration test** — it hits the API rather than calling validators directly.
@@ -393,6 +437,7 @@ Each evaluation script expects a specific filename — files must be named exact
 | Ban List | `ban_list_testing_dataset.csv` |
 | Multiple Validators | `multi_validator_whatsapp_dataset.csv` |
 | Topic Relevance | `topic_relevance/education-topic-relevance-dataset.csv`, `topic_relevance/healthcare-topic-relevance-dataset.csv` |
+| Toxicity | `toxicity/toxicity_test_hasoc.csv`, `toxicity/toxicity_test_sharechat.csv` |
 
 Topic relevance also requires plain-text topic config files alongside each dataset:
 
diff --git a/backend/app/evaluation/toxicity/run.py b/backend/app/evaluation/toxicity/run.py
new file mode 100644
index 0000000..5d3d6cb
--- /dev/null
+++ b/backend/app/evaluation/toxicity/run.py
@@ -0,0 +1,98 @@
+from pathlib import Path
+import pandas as pd
+from guardrails.hub import LlamaGuard7B, NSFWText, ProfanityFree
+from guardrails.validators import FailResult
+
+from app.evaluation.common.helper import (
+    build_evaluation_report,
+    compute_binary_metrics,
+    Profiler,
+    write_csv,
+    write_json,
+)
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+OUT_DIR = BASE_DIR / "outputs" / "toxicity"
+
+DATASETS = {
+    "hasoc": {
+        "path": BASE_DIR / "datasets" / "toxicity" / "toxicity_test_hasoc.csv",
+        "text_col": "text",
+        "label_col": "task1",
+        "label_map": {"HOF": 1, "NOT": 0},
+    },
+    "sharechat": {
+        "path": BASE_DIR / "datasets" / "toxicity" / "toxicity_test_sharechat.csv",
+        "text_col": "commentText",
+        "label_col": "label",
+        "label_map": None,  # already binary int
+    },
+}
+
+VALIDATORS = {
+    "llamaguard_7b": lambda: LlamaGuard7B(on_fail="noop"),
+    "nsfw_text": lambda: NSFWText(
+        threshold=0.8,
+        validation_method="sentence",
+        device="cpu",
+        model_name="textdetox/xlmr-large-toxicity-classifier",
+        on_fail="noop",
+    ),
+    "profanity_free": lambda: ProfanityFree(on_fail="noop"),
+}
+
+
+def run_dataset(dataset_name: str, dataset_cfg: dict):
+    df = pd.read_csv(dataset_cfg["path"])
+    text_col = dataset_cfg["text_col"]
+    label_col = dataset_cfg["label_col"]
+    label_map = dataset_cfg["label_map"]
+
+    if label_map is not None:
+        df["y_true"] = df[label_col].map(label_map)
+    else:
+        df["y_true"] = df[label_col].astype(int)
+
+    all_metrics = {}
+
+    for validator_name, build_fn in VALIDATORS.items():
+        print(f"  Running {validator_name} on {dataset_name}...")
+        validator = build_fn()
+
+        with Profiler() as p:
+            df[f"{validator_name}_result"] = (
+                df[text_col]
+                .astype(str)
+                .apply(
+                    lambda x: p.record(lambda t: validator.validate(t, metadata={}), x)
+                )
+            )
+
+        df[f"{validator_name}_pred"] = df[f"{validator_name}_result"].apply(
+            lambda r: int(isinstance(r, FailResult))
+        )
+
+        metrics = compute_binary_metrics(df["y_true"], df[f"{validator_name}_pred"])
+        all_metrics[validator_name] = build_evaluation_report(
+            guardrail=validator_name,
+            dataset=dataset_name,
+            num_samples=len(df),
+            profiler=p,
+            metrics=metrics,
+        )
+
+        df = df.drop(columns=[f"{validator_name}_result"])
+
+    pred_cols = ["y_true"] + [f"{v}_pred" for v in VALIDATORS]
+    write_csv(
+        df[[text_col, *pred_cols]],
+        OUT_DIR / f"predictions_{dataset_name}.csv",
+    )
+    write_json(all_metrics, OUT_DIR / f"metrics_{dataset_name}.json")
+
+
+for dataset_name, dataset_cfg in DATASETS.items():
+    print(f"Evaluating dataset: {dataset_name}")
+    run_dataset(dataset_name, dataset_cfg)
+
+print("Done. Results saved to", OUT_DIR)
diff --git a/backend/scripts/run_all_evaluations.sh b/backend/scripts/run_all_evaluations.sh
index 0a4de0a..0da2402 100755
--- a/backend/scripts/run_all_evaluations.sh
+++ b/backend/scripts/run_all_evaluations.sh
@@ -11,6 +11,7 @@ RUNNERS=(
   "$EVAL_DIR/gender_assumption_bias/run.py"
   "$EVAL_DIR/ban_list/run.py"
   "$EVAL_DIR/topic_relevance/run.py"
+  "$EVAL_DIR/toxicity/run.py"
 )
 
 echo "Running validator evaluations..."
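
For a quick sanity check of the two locally-run validators before launching the full dataset evaluation, a minimal smoke test is sketched below. It is not part of this diff: it only mirrors the `NSFWText`/`ProfanityFree` construction and the `validate(text, metadata={})` call already used in `run.py`, and `LlamaGuard7B` is left out because (per the README note) it needs a `GUARDRAILS_HUB_API_KEY` and remote inferencing. The sample strings are invented for illustration.

```python
# Hypothetical smoke test, not included in this PR: exercises the two local
# validators from run.py on a couple of hand-written strings so the model
# download and the validate() path are verified before a full dataset run.
from guardrails.hub import NSFWText, ProfanityFree
from guardrails.validators import FailResult

validators = {
    "nsfw_text": NSFWText(
        threshold=0.8,
        validation_method="sentence",
        device="cpu",
        model_name="textdetox/xlmr-large-toxicity-classifier",
        on_fail="noop",
    ),
    "profanity_free": ProfanityFree(on_fail="noop"),
}

samples = ["Thanks, that was really helpful!", "You are a worthless idiot."]

for name, validator in validators.items():
    for text in samples:
        result = validator.validate(text, metadata={})
        pred = int(isinstance(result, FailResult))  # 1 = flagged, same convention as run.py
        print(f"{name}: pred={pred} text={text!r}")
```

The point of the smoke test is not the specific predictions but that both validators load and return a result object; once that works, the full evaluation and `scripts/run_all_evaluations.sh` should behave the same way at scale.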