diff --git a/docs/howto/define-schemas.rst b/docs/howto/define-schemas.rst new file mode 100644 index 000000000..bae348997 --- /dev/null +++ b/docs/howto/define-schemas.rst @@ -0,0 +1,156 @@ +Define Schemas +============== + +:doc:`Transforms </concepts/transforms>` can define schemas to validate task data at each step of the +pipeline. Taskgraph uses `msgspec`_ under the hood, and provides a +:class:`~taskgraph.util.schema.Schema` base class that integrates with +:meth:`TransformSequence.add_validate() <taskgraph.transforms.base.TransformSequence.add_validate>`. + +There are two ways to define a schema: the **class based** approach and the +**dict based** approach. Both produce equivalent results; which one you prefer +is a matter of style. + +.. _msgspec: https://jcristharif.com/msgspec/ + + +Class Based Schemas +------------------- + +Subclass :class:`~taskgraph.util.schema.Schema` and declare fields as class +attributes with type annotations: + +.. code-block:: python + + from typing import Optional + from taskgraph.transforms.base import TransformSequence + from taskgraph.util.schema import Schema + + class MySubConfig(Schema): + total_num: int + fields: list[str] = [] + + class MySchema(Schema, forbid_unknown_fields=False): + config: Optional[MySubConfig] = None + + transforms = TransformSequence() + transforms.add_validate(MySchema) + +A few things to note: + +- Field names use ``snake_case`` in Python but are **automatically renamed** to + ``kebab-case`` in YAML. So ``total_num`` in Python matches + ``total-num`` in YAML. +- ``Optional[T]`` fields default to ``None`` unless you supply an explicit + default. +- Fields without a default are **required**. +- ``forbid_unknown_fields=True`` (the default) causes validation to fail if the + task data contains keys that are not declared in the schema. Set it to + ``False`` on outer schemas so that fields belonging to later transforms are + not rejected.
+ + +Dict Based Schemas +------------------ + +Call :meth:`Schema.from_dict() <taskgraph.util.schema.Schema.from_dict>` with a +dictionary mapping field names to ``type`` or ``(type, default)`` tuples: + +.. code-block:: python + + from typing import Optional, Union + from taskgraph.transforms.base import TransformSequence + from taskgraph.util.schema import Schema + + MySchema = Schema.from_dict( + { + "config": Schema.from_dict( + { + "total-num": int, + "fields": (list[str], []), + }, + optional=True, + ), + }, + forbid_unknown_fields=False, + ) + + transforms = TransformSequence() + transforms.add_validate(MySchema) + +This example is equivalent to the first example. One advantage of the dict based approach +is that you can write keys in **kebab-case** directly. + +Field specifications follow these rules: + +- A bare type (e.g. ``str``) means the field is required. +- ``Optional[T]`` means the field is optional and defaults to ``None``. +- A ``(type, default)`` tuple supplies an explicit default, e.g. + ``(list[str], [])``. + +Keyword arguments to ``from_dict`` are forwarded to ``msgspec.defstruct``. +The most commonly used ones are ``name`` (for better error messages) and +``forbid_unknown_fields``. + +.. note:: + ``Schema.from_dict`` does **not** apply ``rename="kebab"`` automatically, + because you can express the kebab-case names directly in the dict keys. + Underscores in dict keys stay as underscores; keys containing dashes keep + their kebab-case name in YAML and become ``snake_case`` attributes in Python. + + +Nesting Schemas +--------------- + +Both approaches support nesting: + +.. code-block:: python + + # Class-based nesting + class Inner(Schema): + value: str + + class Outer(Schema, forbid_unknown_fields=False, kw_only=True): + inner: Optional[Inner] = None + + # Dict-based nesting + Outer = Schema.from_dict( + { + "inner": Schema.from_dict({"value": str}, optional=True), + }, + forbid_unknown_fields=False, + ) + +Pass ``optional=True`` to ``from_dict`` to make the whole nested schema +optional.
This is necessary as static type checkers reject function calls in type +expressions, so ``Optional[Schema.from_dict(...)]`` would be flagged as invalid. + + +Mutually Exclusive Fields +------------------------- + +Use the ``exclusive`` keyword to declare groups of fields where at most one +may be set at a time: + +.. code-block:: python + + # Class-based + class MySchema(Schema, exclusive=[["field_a", "field_b"]]): + field_a: Optional[str] = None + field_b: Optional[str] = None + + # Dict-based + MySchema = Schema.from_dict( + { + "field-a": Optional[str], + "field-b": Optional[str], + }, + exclusive=[["field_a", "field_b"]], + ) + +``exclusive`` takes a list of groups, where each group is a list of field +names (Python ``snake_case``). A validation error is raised if more than one +field in a group is set (that is, not ``None``). + +.. note:: + When using ``exclusive`` with the dict-based approach, refer to fields by + their Python attribute names (``snake_case``), not their YAML keys. diff --git a/docs/howto/index.rst b/docs/howto/index.rst index d8bae2356..afc06a674 100644 --- a/docs/howto/index.rst +++ b/docs/howto/index.rst @@ -10,6 +10,7 @@ A collection of how-to guides. run-locally debugging bootstrap-taskgraph + define-schemas resolve-keyed-by use-fetches docker diff --git a/src/taskgraph/transforms/from_deps.py b/src/taskgraph/transforms/from_deps.py index 101579394..d0e1ba3cc 100644 --- a/src/taskgraph/transforms/from_deps.py +++ b/src/taskgraph/transforms/from_deps.py @@ -22,46 +22,46 @@ from taskgraph.util.schema import Schema, validate_schema from taskgraph.util.set_name import SET_NAME_MAP - -class FromDepsConfig(Schema): - # Limit dependencies to specified kinds (defaults to all kinds in - # `kind-dependencies`). - # - # The first kind in the list is the "primary" kind. The - # dependency of this kind will be used to derive the label - # and copy attributes (if `copy-attributes` is True).
- kinds: Optional[list[str]] = None - # Set-name function (dynamic: validated at runtime against SET_NAME_MAP). - set_name: Optional[Union[bool, str, dict[str, object]]] = None - # Limit dependencies to tasks whose attributes match - # using :func:`~taskgraph.util.attributes.attrmatch`. - with_attributes: Optional[dict[str, Union[list, str]]] = None - # Group cross-kind dependencies using the given group-by - # function. One task will be created for each group. If not - # specified, the 'single' function will be used which creates - # a new task for each individual dependency. - group_by: Optional[Union[str, dict[str, object]]] = None - # If True, copy attributes from the dependency matching the - # first kind in the `kinds` list (whether specified explicitly - # or taken from `kind-dependencies`). - copy_attributes: Optional[bool] = None - # If true (the default), there must be only a single unique task - # for each kind in a dependency group. Setting this to false - # disables that requirement. - unique_kinds: Optional[bool] = None - # If present, a `fetches` entry will be added for each task - # dependency. Attributes of the upstream task may be used as - # substitution values in the `artifact` or `dest` values of the - # `fetches` entry. - fetches: Optional[dict[str, list[FetchesEntrySchema]]] = None - - -#: Schema for from_deps transforms -class FromDepsSchema(Schema, forbid_unknown_fields=False, kw_only=True): - from_deps: FromDepsConfig - - -FROM_DEPS_SCHEMA = FromDepsSchema +FROM_DEPS_SCHEMA = Schema.from_dict( # Schema for from_deps transforms + { + "from-deps": Schema.from_dict( # required: not built with optional=True + { + # Limit dependencies to specified kinds (defaults to all kinds in + # `kind-dependencies`). + # + # The first kind in the list is the "primary" kind. The + # dependency of this kind will be used to derive the label + # and copy attributes (if `copy-attributes` is True). + "kinds": Optional[list[str]], + # Set-name function (dynamic: validated at runtime against SET_NAME_MAP).
+ "set-name": Optional[Union[bool, str, dict[str, object]]], + # Limit dependencies to tasks whose attributes match + # using :func:`~taskgraph.util.attributes.attrmatch`. + "with-attributes": Optional[dict[str, Union[list, str]]], + # Group cross-kind dependencies using the given group-by + # function. One task will be created for each group. If not + # specified, the 'single' function will be used which creates + # a new task for each individual dependency. + "group-by": Optional[Union[str, dict[str, object]]], + # If True, copy attributes from the dependency matching the + # first kind in the `kinds` list (whether specified explicitly + # or taken from `kind-dependencies`). + "copy-attributes": Optional[bool], + # If true (the default), there must be only a single unique task + # for each kind in a dependency group. Setting this to false + # disables that requirement. + "unique-kinds": Optional[bool], + # If present, a `fetches` entry will be added for each task + # dependency. Attributes of the upstream task may be used as + # substitution values in the `artifact` or `dest` values of the + # `fetches` entry. + "fetches": Optional[dict[str, list[FetchesEntrySchema]]], + }, + ), + }, + name="FromDepsSchema", # same name as the class this definition replaces + forbid_unknown_fields=False, # keys not declared above are accepted +) transforms = TransformSequence() transforms.add_validate(FROM_DEPS_SCHEMA) diff --git a/src/taskgraph/util/schema.py b/src/taskgraph/util/schema.py index ad9a93e5a..3f9350668 100644 --- a/src/taskgraph/util/schema.py +++ b/src/taskgraph/util/schema.py @@ -2,6 +2,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/.
+import inspect import pprint import re import threading @@ -318,6 +319,13 @@ def __getitem__(self, item): return self.schema[item] # type: ignore +def _caller_module_name(depth=1): + """Return ``__name__`` of the module ``depth`` frames above the function calling this helper.""" + # context=0: only the frame object is needed, so skip loading source lines. + frame = inspect.stack(0)[depth + 1].frame + return frame.f_globals.get("__name__", "schema") + + class Schema( msgspec.Struct, kw_only=True, @@ -345,6 +353,12 @@ class MySchema(Schema, forbid_unknown_fields=False, kw_only=True): foo: str """ + def __init_subclass__(cls, exclusive=None, **kwargs): + """Store the ``exclusive`` field groups, when given, on the subclass.""" + super().__init_subclass__(**kwargs) + if exclusive is not None: + cls.exclusive = exclusive + def __post_init__(self): if taskgraph.fast: return @@ -370,6 +384,87 @@ def __post_init__(self): keyed_by.validate(obj) + # Validate mutually exclusive field groups. + for group in getattr(self, "exclusive", []): + set_fields = [f for f in group if getattr(self, f) is not None] + if len(set_fields) > 1: + raise ValueError( + f"{' and '.join(repr(f) for f in set_fields)} are mutually exclusive" + ) + + @classmethod + def from_dict( + cls, + fields_dict: dict[str, Any], + name: Optional[str] = None, + optional: bool = False, + **kwargs, + ) -> Union[type[msgspec.Struct], type[Optional[msgspec.Struct]]]: + """Create a Schema subclass dynamically from a dict of field definitions. + + Each key is a field name and each value is either a type annotation or a + ``(type, default)`` tuple. Fields typed as ``Optional[...]`` (or ``X | None``) automatically + receive a default of ``None`` when no explicit default is provided. + + Usage:: + + Schema.from_dict({ + "required_field": str, + "optional_field": Optional[int], # default None inferred + "explicit_default": (list[str], []), # explicit default + }, name="MySchema") + + ``name`` sets the name of the generated struct and defaults to the last + component of the calling module's name. With ``optional=True`` the result + is wrapped in ``Optional[...]`` so it can be nested as an optional field. + + ``exclusive`` is stored on the generated class; all other keyword arguments + are forwarded to ``msgspec.defstruct`` (e.g. + ``forbid_unknown_fields=False``). + """ + # Don't use `rename=kebab` by default as we can define kebab case + # properly in dicts. + kwargs.setdefault("rename", None) + + # Ensure name and module are set correctly for error messages.
+ caller_module = _caller_module_name() + kwargs.setdefault("module", caller_module) + name = name or caller_module.rsplit(".", 1)[-1] + + # PEP 604 unions (``X | None``) report ``types.UnionType`` as their origin + # on Python 3.10+, so accept both spellings of an optional type. + import types + + union_origins = (Union, getattr(types, "UnionType", Union)) + + fields = [] + for field_name, field_spec in fields_dict.items(): + python_name = field_name.replace("-", "_") + + if isinstance(field_spec, tuple): + typ, default = field_spec + else: + typ = field_spec + if get_origin(typ) in union_origins and type(None) in get_args(typ): + default = None + else: + default = msgspec.NODEFAULT + + if field_name != python_name: + # Use msgspec.field to preserve the kebab-case encoded name. + # Explicit field names take priority over the struct-level rename. + fields.append( + (python_name, typ, msgspec.field(name=field_name, default=default)) + ) + else: + fields.append((python_name, typ, default)) + + exclusive = kwargs.pop("exclusive", None) + result = msgspec.defstruct(name or "Schema", fields, bases=(cls,), **kwargs) + if exclusive: + result.exclusive = exclusive # type: ignore[attr-defined] + return Optional[result] if optional else result # type: ignore[valid-type] + @classmethod def validate(cls, data): """Validate data against this schema.""" diff --git a/test/test_util_schema.py b/test/test_util_schema.py index e0492ae43..cb67412ca 100644 --- a/test/test_util_schema.py +++ b/test/test_util_schema.py @@ -3,6 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/.
import unittest +from typing import Optional import msgspec import pytest @@ -348,3 +349,53 @@ class TestSchema(Schema): with pytest.raises(msgspec.ValidationError): TestSchema.validate({"field": {"by-foo": {"a": "b"}}}) + + +@pytest.mark.parametrize( # each case must convert and expose the expected value + "fields_dict, data, attr, expected", + [ + ({"name": str}, {"name": "foo"}, "name", "foo"), # required field supplied + ({"count": Optional[int]}, {}, "count", None), # Optional[...] defaults to None + ({"tags": (list, [])}, {}, "tags", []), # (type, default) tuple + ({"my-field": str}, {"my-field": "bar"}, "my_field", "bar"), # kebab key -> snake_case attribute + ], +) +def test_from_dict_valid(fields_dict, data, attr, expected): + S = Schema.from_dict(fields_dict) + result = msgspec.convert(data, S) + assert getattr(result, attr) == expected + + +@pytest.mark.parametrize( # each case must fail validation + "fields_dict, data", + [ + ({"name": str}, {}), # required field missing + ({"my-field": str}, {"my_field": "bar"}), # data must use the kebab-case key + ], +) +def test_from_dict_invalid(fields_dict, data): + S = Schema.from_dict(fields_dict) + with pytest.raises(msgspec.ValidationError): + msgspec.convert(data, S) + + +@pytest.mark.parametrize( # "a" and "b" form a single exclusive group + "data, raises", + [ + ({"a": "x", "b": "y"}, True), # both set + ({"a": "x"}, False), # only one set + ({}, False), # neither set + ], +) +def test_exclusive(data, raises): + S = Schema.from_dict( + {"a": Optional[str], "b": Optional[str]}, + exclusive=[["a", "b"]], + ) + if raises: + with pytest.raises( + (ValueError, msgspec.ValidationError), match="mutually exclusive" + ): + msgspec.convert(data, S) + else: + msgspec.convert(data, S)