Changes from all commits
47 commits
650369c
added toxicity detection validators
rkritika1508 Apr 1, 2026
949647d
fixed import error
rkritika1508 Apr 1, 2026
da50537
removed redundant validators
rkritika1508 Apr 2, 2026
9ab64c7
Added NSFW text validator
rkritika1508 Apr 2, 2026
b64d0e9
fixed test
rkritika1508 Apr 2, 2026
57d97b2
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 2, 2026
09b6a05
fix: profanity free validator description
dennyabrain Apr 6, 2026
f4a11fa
doc: updated details of sentence parameter
dennyabrain Apr 7, 2026
f330f1b
fix: remove vscode files
dennyabrain Apr 7, 2026
51c9266
Added integration tests
rkritika1508 Apr 7, 2026
141e5fc
Merge branch 'main' into feat/toxicity-hub-validators
rkritika1508 Apr 7, 2026
c76f829
added integration tests
rkritika1508 Apr 7, 2026
baac9e4
fix: profanity free validator description
dennyabrain Apr 6, 2026
627fb4f
Added integration tests
rkritika1508 Apr 7, 2026
8b3da89
validator config: add name to config (#79)
nishika26 Apr 7, 2026
cc0bb14
added integration tests
rkritika1508 Apr 7, 2026
3037eb8
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 7, 2026
b69883d
added integration tests
rkritika1508 Apr 7, 2026
8f67176
updated readme
rkritika1508 Apr 7, 2026
affe72d
Added installation of huggingface model in dockerfile
rkritika1508 Apr 7, 2026
8b0a183
resolved comment
rkritika1508 Apr 7, 2026
14f6dc1
removed blank line
rkritika1508 Apr 7, 2026
74f8a82
updated policies for llama guard
rkritika1508 Apr 7, 2026
6676414
fixed tests
rkritika1508 Apr 7, 2026
0d15d0c
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 7, 2026
6443c1b
updated readme and fixed llama guard inference
rkritika1508 Apr 8, 2026
af933ef
fixed test
rkritika1508 Apr 8, 2026
9b6616a
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 9, 2026
9aca5f2
Merge branch 'main' into feat/toxicity-hub-validators
rkritika1508 Apr 10, 2026
664ded8
resolved comments
rkritika1508 Apr 10, 2026
0ce6ebb
Added evaluation readme (#82)
rkritika1508 Apr 10, 2026
ba27b80
resolved comments
rkritika1508 Apr 10, 2026
d7c5eba
resolved comments
rkritika1508 Apr 10, 2026
02fd043
fixed llama guard
rkritika1508 Apr 10, 2026
d9569ba
Merge branch 'feat/toxicity-hub-validators' into feat/toxicity-huggin…
rkritika1508 Apr 10, 2026
31af2f6
Toxicity Detection validators (#80)
rkritika1508 Apr 10, 2026
a061af8
Merge branch 'main' into feat/toxicity-huggingface-model
rkritika1508 Apr 10, 2026
88c1b56
removed unnecessary changes
rkritika1508 Apr 10, 2026
5b2fe3b
fix: update default nsfw_text model to michellejieli/NSFW_text_classi…
rkritika1508 Apr 10, 2026
fd3cddc
fix: use textdetox/xlmr-large-toxicity-classifier as default nsfw_tex…
rkritika1508 Apr 10, 2026
7264771
updated readme
rkritika1508 Apr 10, 2026
217ba9b
Merge branch 'main' into feat/toxicity-huggingface-model
nishika26 Apr 13, 2026
edcdd26
added toxicity evaluation
rkritika1508 Apr 13, 2026
28befd0
updated
rkritika1508 Apr 13, 2026
a8b9818
Merge branch 'main' into feat/toxicity-evaluation
rkritika1508 Apr 16, 2026
e64ad88
Merge branch 'main' into feat/toxicity-evaluation
rkritika1508 Apr 22, 2026
69fb065
Merge branch 'main' into feat/toxicity-evaluation
rkritika1508 Apr 23, 2026
49 changes: 47 additions & 2 deletions backend/app/evaluation/README.md
@@ -48,7 +48,8 @@ backend/app/evaluation/
│   └── run.py                  # PII evaluation script
├── topic_relevance/
│   └── run.py                  # Topic relevance evaluation script
└── toxicity/                   # Toxicity evaluation scripts
└── toxicity/
    └── run.py                  # Toxicity evaluation script (LlamaGuard7B, NSFWText, ProfanityFree)
```

## Prerequisites
@@ -91,7 +92,7 @@ Validators that use LLM-as-judge approach will require credentials for LLM provi

## Running All Evaluations

To run all individual validator evaluations in sequence (lexical slur, PII, gender assumption bias, ban list, topic relevance):
To run all individual validator evaluations in sequence (lexical slur, PII, gender assumption bias, ban list, topic relevance, toxicity):

```bash
bash scripts/run_all_evaluations.sh
@@ -262,6 +263,49 @@ python3 app/evaluation/topic_relevance/run.py

---

### Toxicity (`llamaguard_7b`, `nsfw_text`, `profanity_free`)

**Script:** `app/evaluation/toxicity/run.py`

**Datasets:**
- `datasets/toxicity/toxicity_test_hasoc.csv`
- `datasets/toxicity/toxicity_test_sharechat.csv`

Expected columns — HASOC dataset:

- `text` — tweet/comment text to validate
- `task1` — ground truth label (`HOF` = hate/offensive/profanity → `1`, `NOT` → `0`)
- `lang` — language code (informational)

Expected columns — ShareChat dataset:

- `commentText` — comment text to validate
- `label` — binary ground truth (`1` = toxic, `0` = not toxic)
- `language` — language label (informational)
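
Both label schemes are reduced to the same binary `y_true` before scoring. A minimal sketch of that normalization, mirroring what the evaluation script (shown later in this diff) does; the example rows here are hypothetical:

```python
import pandas as pd

# Hypothetical single-row examples of the two datasets.
hasoc = pd.DataFrame({"text": ["<tweet text>"], "task1": ["HOF"], "lang": ["hi"]})
sharechat = pd.DataFrame({"commentText": ["<comment text>"], "label": [1], "language": ["Hindi"]})

# HASOC labels are mapped to binary; ShareChat labels are already 0/1.
hasoc["y_true"] = hasoc["task1"].map({"HOF": 1, "NOT": 0})
sharechat["y_true"] = sharechat["label"].astype(int)
```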

**What it does:** Runs three validators — `LlamaGuard7B`, `NSFWText`, and `ProfanityFree` — independently on each dataset. For each validator, a binary prediction is recorded (`1` if it returns a `FailResult`, `0` otherwise) and compared against the ground-truth label to compute classification metrics.
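
A rough sketch of how one prediction is derived, using the same `guardrails` calls as the script below (the sample text is a placeholder):

```python
from guardrails.hub import ProfanityFree
from guardrails.validators import FailResult

# Validators are called directly on the raw text; a FailResult counts as a
# positive (toxic) prediction.
validator = ProfanityFree(on_fail="noop")
result = validator.validate("some user comment", metadata={})
prediction = int(isinstance(result, FailResult))  # 1 = flagged, 0 = passed
```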

**Output per dataset:**

```
outputs/toxicity/predictions_hasoc.csv
outputs/toxicity/metrics_hasoc.json
outputs/toxicity/predictions_sharechat.csv
outputs/toxicity/metrics_sharechat.json
```

Each predictions CSV contains the source text, ground truth (`y_true`), and one `*_pred` column per validator. Each metrics JSON contains accuracy, precision, recall, F1, and performance stats broken down per validator.
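
For the HASOC split, for example, the predictions file is expected to have this column layout (derived from how the script writes its output; one `*_pred` column per validator):

```
text,y_true,llamaguard_7b_pred,nsfw_text_pred,profanity_free_pred
```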

**Run:**

```bash
python3 app/evaluation/toxicity/run.py
```

> **Note:** `LlamaGuard7B` uses remote inference, so it requires a valid `GUARDRAILS_HUB_API_KEY` and internet access. `NSFWText` downloads the `textdetox/xlmr-large-toxicity-classifier` model on first run.

---

## Multiple Validators Evaluation (End-to-End)

This evaluation runs multiple validators **together** against a dataset via the live guardrails API. Unlike the individual evaluations above, this is an **end-to-end integration test** — it hits the API rather than calling validators directly.
@@ -393,6 +437,7 @@ Each evaluation script expects a specific filename — files must be named exact
| Ban List | `ban_list_testing_dataset.csv` |
| Multiple Validators | `multi_validator_whatsapp_dataset.csv` |
| Topic Relevance | `topic_relevance/education-topic-relevance-dataset.csv`, `topic_relevance/healthcare-topic-relevance-dataset.csv` |
| Toxicity | `toxicity/toxicity_test_hasoc.csv`, `toxicity/toxicity_test_sharechat.csv` |

Topic relevance also requires plain-text topic config files alongside each dataset:

98 changes: 98 additions & 0 deletions backend/app/evaluation/toxicity/run.py
@@ -0,0 +1,98 @@
from pathlib import Path
import pandas as pd
from guardrails.hub import LlamaGuard7B, NSFWText, ProfanityFree
from guardrails.validators import FailResult

from app.evaluation.common.helper import (
    build_evaluation_report,
    compute_binary_metrics,
    Profiler,
    write_csv,
    write_json,
)

BASE_DIR = Path(__file__).resolve().parent.parent
OUT_DIR = BASE_DIR / "outputs" / "toxicity"

DATASETS = {
    "hasoc": {
        "path": BASE_DIR / "datasets" / "toxicity" / "toxicity_test_hasoc.csv",
        "text_col": "text",
        "label_col": "task1",
        "label_map": {"HOF": 1, "NOT": 0},
    },
    "sharechat": {
        "path": BASE_DIR / "datasets" / "toxicity" / "toxicity_test_sharechat.csv",
        "text_col": "commentText",
        "label_col": "label",
        "label_map": None,  # already binary int
    },
}

VALIDATORS = {
    "llamaguard_7b": lambda: LlamaGuard7B(on_fail="noop"),
    "nsfw_text": lambda: NSFWText(
        threshold=0.8,
        validation_method="sentence",
        device="cpu",
        model_name="textdetox/xlmr-large-toxicity-classifier",
        on_fail="noop",
    ),
    "profanity_free": lambda: ProfanityFree(on_fail="noop"),
}


def run_dataset(dataset_name: str, dataset_cfg: dict):
    df = pd.read_csv(dataset_cfg["path"])
    text_col = dataset_cfg["text_col"]
    label_col = dataset_cfg["label_col"]
    label_map = dataset_cfg["label_map"]

    # Normalize ground truth to a binary y_true column
    # (HASOC: HOF -> 1, NOT -> 0; ShareChat labels are already 0/1).
    if label_map is not None:
        df["y_true"] = df[label_col].map(label_map)
    else:
        df["y_true"] = df[label_col].astype(int)

    all_metrics = {}

    for validator_name, build_fn in VALIDATORS.items():
        print(f" Running {validator_name} on {dataset_name}...")
        validator = build_fn()

        # Run every validate() call through the shared Profiler so the report
        # can include performance stats alongside the predictions.
        with Profiler() as p:
            df[f"{validator_name}_result"] = (
                df[text_col]
                .astype(str)
                .apply(
                    lambda x: p.record(lambda t: validator.validate(t, metadata={}), x)
                )
            )

        # A FailResult means the validator flagged the text, recorded as a 1 (toxic) prediction.
        df[f"{validator_name}_pred"] = df[f"{validator_name}_result"].apply(
            lambda r: int(isinstance(r, FailResult))
        )

        metrics = compute_binary_metrics(df["y_true"], df[f"{validator_name}_pred"])
        all_metrics[validator_name] = build_evaluation_report(
            guardrail=validator_name,
            dataset=dataset_name,
            num_samples=len(df),
            profiler=p,
            metrics=metrics,
        )

        df = df.drop(columns=[f"{validator_name}_result"])

    pred_cols = ["y_true"] + [f"{v}_pred" for v in VALIDATORS]
    write_csv(
        df[[text_col, *pred_cols]],
        OUT_DIR / f"predictions_{dataset_name}.csv",
    )
    write_json(all_metrics, OUT_DIR / f"metrics_{dataset_name}.json")


# Evaluate every configured dataset with all three validators.
for dataset_name, dataset_cfg in DATASETS.items():
    print(f"Evaluating dataset: {dataset_name}")
    run_dataset(dataset_name, dataset_cfg)

print("Done. Results saved to", OUT_DIR)
1 change: 1 addition & 0 deletions backend/scripts/run_all_evaluations.sh
@@ -11,6 +11,7 @@ RUNNERS=(
"$EVAL_DIR/gender_assumption_bias/run.py"
"$EVAL_DIR/ban_list/run.py"
"$EVAL_DIR/topic_relevance/run.py"
"$EVAL_DIR/toxicity/run.py"
)

echo "Running validator evaluations..."