diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/__init__.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/__init__.py index 83e23ba88552..1c5cdba52088 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/__init__.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/__init__.py @@ -25,7 +25,14 @@ from .utilities import docai_utilities, gcs_utilities from .wrappers import document, entity, page -__all__ = (document, page, entity, converter, docai_utilities, gcs_utilities) +__all__ = ( + "document", + "page", + "entity", + "converter", + "docai_utilities", + "gcs_utilities", +) class Python37DeprecationWarning(DeprecationWarning): # pragma: NO COVER diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py index b256fea715d7..fe3a45a9328b 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py @@ -14,6 +14,7 @@ # limitations under the License. # + from typing import Callable, List, Optional from intervaltree import intervaltree @@ -190,16 +191,21 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly: y_multiplier = 1.0 normalized_vertices: List[documentai.NormalizedVertex] = [] - if block.page_width and block.page_height: + if ( + block.page_width + and block.page_height + and block.docproto_width is not None + and block.docproto_height is not None + ): x_multiplier = _get_multiplier( docproto_coordinate=block.docproto_width, external_coordinate=block.page_width, - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", ) y_multiplier = _get_multiplier( docproto_coordinate=block.docproto_height, external_coordinate=block.page_height, - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", ) if block.bounding_type == "1": @@ -208,13 +214,13 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly: for coordinate in block.bounding_box: x = _convert_bbox_units( coordinate[f"{block.bounding_x}"], - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", width=block.docproto_width, multiplier=x_multiplier, ) y = _convert_bbox_units( coordinate[f"{block.bounding_y}"], - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", height=block.docproto_height, multiplier=y_multiplier, ) @@ -224,18 +230,24 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly: elif block.bounding_type == "2": # Type 2 : bounding box has 1 (x,y) coordinates for the top left corner # and (width, height) + if not isinstance(block.bounding_box, dict): + raise TypeError("Expected dict for bounding_box in Type 2") x_min = _convert_bbox_units( block.bounding_box[f"{block.bounding_x}"], - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", width=block.page_width, multiplier=x_multiplier, ) y_min = _convert_bbox_units( block.bounding_box[f"{block.bounding_y}"], - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", width=block.page_height, multiplier=y_multiplier, ) + if block.bounding_width is None or block.bounding_height is None: + raise ValueError( + "bounding_width and bounding_height must be set for Type 2" + ) x_max = x_min + block.bounding_width y_max = y_min + block.bounding_height normalized_vertices.extend( @@ -249,16 +261,18 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly: elif block.bounding_type == "3": # Type 3 : bounding_box: [x1, y1, x2, y2, x3, y3, x4, y4] + if not isinstance(block.bounding_box, list): + raise TypeError("Expected list for bounding_box in Type 3") for idx in range(0, len(block.bounding_box), 2): x = _convert_bbox_units( block.bounding_box[idx], - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", width=block.docproto_width, multiplier=x_multiplier, ) y = _convert_bbox_units( block.bounding_box[idx + 1], - input_bbox_units=block.bounding_unit, + input_bbox_units=block.bounding_unit or "normalized", width=block.docproto_height, multiplier=y_multiplier, ) diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/block.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/block.py index 50fd63cea2a6..2fbf97237a0c 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/block.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/config/block.py @@ -17,12 +17,12 @@ import dataclasses import json from types import SimpleNamespace -from typing import List, Optional, Type +from typing import Any, List, Optional, Type, Union from google.cloud import documentai -def _get_target_object(json_data: any, target_object: str) -> Optional[SimpleNamespace]: +def _get_target_object(json_data: Any, target_object: str) -> Any: r"""Returns SimpleNamespace of target_object. Args: @@ -72,45 +72,39 @@ class Block: page_number: Optional. """ - type_: SimpleNamespace = dataclasses.field(init=True, repr=False) - text: SimpleNamespace = dataclasses.field(init=True, repr=False) - bounding_box: Optional[SimpleNamespace] = dataclasses.field( + type_: Any = dataclasses.field(init=True, repr=False) + text: str = dataclasses.field(init=True, repr=False) + bounding_box: Optional[Union[SimpleNamespace, List[Any]]] = dataclasses.field( init=True, repr=False, default=None ) - block_references: Optional[SimpleNamespace] = dataclasses.field( - init=True, repr=False, default=None - ) - block_id: Optional[SimpleNamespace] = dataclasses.field( - init=False, repr=False, default=None - ) - confidence: Optional[SimpleNamespace] = dataclasses.field( - init=False, repr=False, default=None - ) - page_number: Optional[SimpleNamespace] = dataclasses.field( + block_references: Any = dataclasses.field(init=True, repr=False, default=None) + block_id: Optional[str] = dataclasses.field(init=False, repr=False, default=None) + confidence: Optional[float] = dataclasses.field( init=False, repr=False, default=None ) - page_width: Optional[SimpleNamespace] = dataclasses.field( + page_number: Optional[int] = dataclasses.field(init=False, repr=False, default=None) + page_width: Optional[float] = dataclasses.field( init=False, repr=False, default=None ) - page_height: Optional[SimpleNamespace] = dataclasses.field( + page_height: Optional[float] = dataclasses.field( init=False, repr=False, default=None ) - bounding_width: Optional[SimpleNamespace] = dataclasses.field( + bounding_width: Optional[float] = dataclasses.field( init=False, repr=False, default=None ) - bounding_height: Optional[SimpleNamespace] = dataclasses.field( + bounding_height: Optional[float] = dataclasses.field( init=False, repr=False, default=None ) - bounding_type: Optional[SimpleNamespace] = dataclasses.field( + bounding_type: Optional[str] = dataclasses.field( init=False, repr=False, default=None ) - bounding_unit: Optional[SimpleNamespace] = dataclasses.field( + bounding_unit: Optional[str] = dataclasses.field( init=False, repr=False, default=None ) - bounding_x: Optional[SimpleNamespace] = dataclasses.field( + bounding_x: Optional[float] = dataclasses.field( init=False, repr=False, default=None ) - bounding_y: Optional[SimpleNamespace] = dataclasses.field( + bounding_y: Optional[float] = dataclasses.field( init=False, repr=False, default=None ) docproto_width: Optional[float] = dataclasses.field( @@ -180,6 +174,8 @@ def load_blocks_from_schema( blocks: List[Block] = [] ens = _get_target_object(objects, entities) + if not isinstance(ens, (list, dict)): + raise TypeError("Expected list or dict for entities") for i in ens: entity = i @@ -203,11 +199,13 @@ def load_blocks_from_schema( b = Block( type_=block_type, text=block_text, - bounding_box=_get_target_object(entity, normalized_vertices), + bounding_box=_get_target_object(entity, normalized_vertices) + if normalized_vertices is not None + else None, ) if id_: - b.id_ = _get_target_object(entity, id_) + b.block_id = _get_target_object(entity, id_) if confidence: b.confidence = _get_target_object(entity, confidence) if page_number and page_number in entity: diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/converter.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/converter.py index ffc7c1380632..e87b6fb8bca6 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/converter.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/converter.py @@ -424,7 +424,7 @@ def convert_from_config( print("-------- Converting Started --------") files, labels, did_not_convert = _get_docproto_files( - futures_list, project_id, location, processor_id + list(futures_list), project_id, location, processor_id ) print("-------- Finished Converting --------") diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/vision_helpers.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/vision_helpers.py index dbcfd74855e1..77f1c08e7fac 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/vision_helpers.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/converters/vision_helpers.py @@ -243,17 +243,17 @@ def _generate_entity_annotations( """ entity_annotations: List[EntityAnnotation] = [] for token in page_info.page.tokens: - v: vision.Vertex = [] + v: list[vision.Vertex] = [] if token.layout.bounding_poly.vertices: for vertex in token.layout.bounding_poly.vertices: - v.append({"x": int(vertex.x), "y": int(vertex.y)}) + v.append(vision.Vertex(x=int(vertex.x), y=int(vertex.y))) else: for normalized_vertex in token.layout.bounding_poly.normalized_vertices: v.append( - { - "x": int(normalized_vertex.x * page_info.page.dimension.width), - "y": int(normalized_vertex.y * page_info.page.dimension.height), - } + vision.Vertex( + x=int(normalized_vertex.x * page_info.page.dimension.width), + y=int(normalized_vertex.y * page_info.page.dimension.height), + ) ) text_start_index = token.layout.text_anchor.text_segments[0].start_index diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 4ed6e90d6fc3..3f2c8a487c22 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/utilities/gcs_utilities.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -20,7 +20,9 @@ from google.api_core.gapic_v1 import client_info -from google.cloud import documentai, documentai_toolbox, storage +from google.cloud import documentai # type: ignore[attr-defined] +from google.cloud import documentai_toolbox +from google.cloud import storage # type: ignore[attr-defined] from google.cloud.documentai_toolbox import constants @@ -91,6 +93,8 @@ def get_blobs( if gcs_uri: gcs_bucket_name, gcs_prefix = split_gcs_uri(gcs_uri) + if gcs_prefix is None: + raise TypeError("gcs_prefix cannot be None") if re.match(constants.FILE_CHECK_REGEX, gcs_prefix): raise ValueError("gcs_prefix cannot contain file types") diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py index 092cf2d9d307..9a811bee0b5f 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py @@ -22,7 +22,7 @@ import glob import os import re -from typing import Dict, Iterator, List, Optional, Type, Union +from typing import Any, Dict, Iterable, Iterator, List, Optional, Type, Union from google.api_core.client_options import ClientOptions from google.api_core.operation import from_gapic as operation_from_gapic @@ -51,7 +51,7 @@ def _document_layout_blocks_from_shards( shards: List[documentai.Document], ) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]: def extract_blocks( - blocks: List[documentai.Document.DocumentLayout.DocumentLayoutBlock], + blocks: Iterable[documentai.Document.DocumentLayout.DocumentLayoutBlock], ) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]: queue = collections.deque(blocks) @@ -325,8 +325,9 @@ def _dict_to_bigquery( bq_client = bigquery.Client( project=project_id, client_info=gcs_utilities._get_client_info() ) + resolved_project_id = project_id or bq_client.project table_ref = bigquery.DatasetReference( - project=project_id, dataset_id=dataset_name + project=resolved_project_id, dataset_id=dataset_name ).table(table_name) job_config = bigquery.LoadJobConfig( @@ -345,7 +346,7 @@ def _dict_to_bigquery( def _apply_text_offset( - documentai_object: Union[Dict[str, Dict], List], text_offset: int + documentai_object: Union[Dict[str, Any], List[Any]], text_offset: int ) -> None: r"""Applies a text offset to all text_segments in `documentai_object`. diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/entity.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/entity.py index 02b2e1ba306c..f8e867f59d3c 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/entity.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/entity.py @@ -63,7 +63,10 @@ class Entity: _image: Optional[Image.Image] = dataclasses.field(init=False, default=None) - def __post_init__(self, page_offset: int) -> None: + def __post_init__(self, page_offset: Optional[int]) -> None: + if page_offset is None: + page_offset = 0 + self.type_ = self.documentai_object.type_ if self.documentai_object.mention_text: diff --git a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/page.py b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/page.py index 35a2491e634b..3dd9b9d68876 100644 --- a/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/page.py +++ b/packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/page.py @@ -18,7 +18,7 @@ from abc import ABC import dataclasses from functools import cached_property -from typing import Iterable, List, Optional, Type +from typing import Iterable, List, Optional, Type, TypeVar import pandas as pd @@ -26,6 +26,8 @@ from google.cloud.documentai_toolbox.constants import ElementWithLayout from google.cloud.documentai_toolbox.utilities import docai_utilities +T = TypeVar("T", bound="_BasePageElement") + @dataclasses.dataclass class Table: @@ -180,9 +182,7 @@ def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment: """ return self.documentai_object.layout.text_anchor.text_segments[0] - def _get_children_of_element( - self, potential_children: List["_BasePageElement"] - ) -> List["_BasePageElement"]: + def _get_children_of_element(self, potential_children: List[T]) -> List[T]: """ Filters potential child elements to identify only those fully contained within this element. diff --git a/packages/google-cloud-documentai-toolbox/noxfile.py b/packages/google-cloud-documentai-toolbox/noxfile.py index 4d9415cb22e7..36729f34f270 100644 --- a/packages/google-cloud-documentai-toolbox/noxfile.py +++ b/packages/google-cloud-documentai-toolbox/noxfile.py @@ -480,10 +480,21 @@ def prerelease_deps(session, protobuf_implementation): @nox.session(python=DEFAULT_PYTHON_VERSION) def mypy(session): """Run the type checker.""" - - # TODO(https://github.com/googleapis/google-cloud-python/issues/16014): - # Enable mypy once this bug is fixed. - session.skip("Temporarily skip mypy. See issue 16014") + session.install( + "mypy<1.16.0", + "types-requests", + "types-protobuf", + "pandas-stubs", + ) + session.install("-e", ".") + session.run( + "mypy", + "-p", + "google.cloud.documentai_toolbox", + "--check-untyped-defs", + "--ignore-missing-imports", + *session.posargs, + ) @nox.session(python=DEFAULT_PYTHON_VERSION)