From 1eabb0020a5975e6ca960203883d46d137edbe0c Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 17 Apr 2026 15:06:48 +0530 Subject: [PATCH 1/6] default file size and adding documentation --- backend/app/api/docs/documents/upload.md | 1 + .../services/collections/create_collection.py | 7 +++--- backend/app/services/collections/helpers.py | 22 ++++++++++++++++++- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/backend/app/api/docs/documents/upload.md b/backend/app/api/docs/documents/upload.md index e667015f5..c4c06caa6 100644 --- a/backend/app/api/docs/documents/upload.md +++ b/backend/app/api/docs/documents/upload.md @@ -1,6 +1,7 @@ Upload a document to Kaapi. - If only a file is provided, the document will be uploaded and stored, and its ID will be returned. +- The maximum file size allowed for upload is 25 MB. - If a target format is specified, a transformation job will also be created to transform document into target format in the background. The response will include both the uploaded document details and information about the transformation job. - If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed. 
diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index eb37fd039..d12b7be3f 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -22,6 +22,7 @@ CreationRequest, ) from app.services.collections.helpers import ( + calculate_total_size_kb, extract_error_message, to_collection_public, ) @@ -156,6 +157,7 @@ def execute_job( result = None creation_request = None provider = None + storage = None try: creation_request = CreationRequest(**request) @@ -169,9 +171,10 @@ def execute_job( with Session(engine) as session: document_crud = DocumentCrud(session, project_id) flat_docs = document_crud.read_each(creation_request.documents) + storage = get_cloud_storage(session=session, project_id=project_id) file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname} - total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs) + total_size_kb = calculate_total_size_kb(flat_docs, storage) total_size_mb = round(total_size_kb / 1024, 2) with Session(engine) as session: @@ -186,8 +189,6 @@ def execute_job( ), ) - storage = get_cloud_storage(session=session, project_id=project_id) - provider = get_llm_provider( session=session, provider=creation_request.provider, diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 6985ac78e..66f9dc1c0 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -2,6 +2,7 @@ import json import ast import re +from typing import TYPE_CHECKING from uuid import UUID from fastapi import HTTPException @@ -11,6 +12,9 @@ from app.api.deps import SessionDep from app.models import DocumentCollection, Collection, CollectionPublic, Document +if TYPE_CHECKING: + from app.core.cloud.storage import CloudStorage + logger = logging.getLogger(__name__) @@ -63,6 +67,22 @@ def 
extract_error_message(err: Exception) -> str: return message.strip()[:1000] +def calculate_total_size_kb(documents: list[Document], storage: CloudStorage) -> float: + """ + Sum document sizes in KB. Uses the stored file_size_kb if available. + """ + total: float = 0 + for doc in documents: + if doc.file_size_kb is not None: + total += doc.file_size_kb + else: + logger.info( + f"[calculate_total_size_kb] file_size_kb missing, fetching from storage | {{'doc_id': '{doc.id}', 'fname': '{doc.fname}'}}" + ) + total += storage.get_file_size_kb(doc.object_store_url) + return total + + def batch_documents(documents: list[Document]) -> list[list[Document]]: """ Batch documents dynamically based on size and count limits. @@ -83,7 +103,7 @@ def batch_documents(documents: list[Document]) -> list[list[Document]]: current_batch_size_kb = 0 for doc in documents: - doc_size_kb = doc.file_size_kb or 0 + doc_size_kb = doc.file_size_kb or 15 * 1024 would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT From 7f5d86f45af99e6b8133d9954158377c41106df1 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 17 Apr 2026 15:09:09 +0530 Subject: [PATCH 2/6] default file size and adding documentation --- backend/app/api/docs/documents/upload.md | 3 +-- backend/app/services/collections/create_collection.py | 1 - backend/app/services/collections/helpers.py | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/backend/app/api/docs/documents/upload.md b/backend/app/api/docs/documents/upload.md index c4c06caa6..438dc3e9b 100644 --- a/backend/app/api/docs/documents/upload.md +++ b/backend/app/api/docs/documents/upload.md @@ -1,7 +1,6 @@ Upload a document to Kaapi. -- If only a file is provided, the document will be uploaded and stored, and its ID will be returned. -- The maximum file size allowed for upload is 25 MB. 
+- If only a file is provided, the document will be uploaded and stored, and its ID will be returned. The maximum file size allowed for upload is 25 MB. - If a target format is specified, a transformation job will also be created to transform document into target format in the background. The response will include both the uploaded document details and information about the transformation job. - If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed. diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index d12b7be3f..009d55fd1 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -152,7 +152,6 @@ def execute_job( """ start_time = time.time() - # Keeping the references for potential backout/cleanup on failure collection_job = None result = None creation_request = None diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 66f9dc1c0..1b0ae0ace 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -23,7 +23,6 @@ MAX_DOC_SIZE_MB = 25 # 25 MB maximum per document # Maximum batch size for uploading documents to vector store -# Derived from MAX_DOC_SIZE + buffer to ensure single docs always fit MAX_BATCH_SIZE_KB = (MAX_DOC_SIZE_MB + 5) * 1024 # 30 MB in KB (25 + 5 MB buffer) MAX_BATCH_COUNT = 200 # Maximum documents per batch From 8e3d29d2093f488d7c54a0ec2f1708c05d7023b7 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 17 Apr 2026 15:30:57 +0530 Subject: [PATCH 3/6] coderabbit reviews --- .../services/collections/create_collection.py | 19 +++++++++++++++- backend/app/services/collections/helpers.py | 22 +------------------ 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/backend/app/services/collections/create_collection.py 
b/backend/app/services/collections/create_collection.py index 009d55fd1..887208e18 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -22,7 +22,6 @@ CreationRequest, ) from app.services.collections.helpers import ( - calculate_total_size_kb, extract_error_message, to_collection_public, ) @@ -136,6 +135,24 @@ def _mark_job_failed( return None +def calculate_total_size_kb( + documents: list[Document], storage: "CloudStorage" +) -> float: + """ + Sum document sizes in KB. Uses the stored file_size_kb if available. + """ + total: float = 0 + for doc in documents: + if doc.file_size_kb is not None: + total += doc.file_size_kb + else: + logger.info( + f"[calculate_total_size_kb] file_size_kb missing, fetching from storage | {{'doc_id': '{doc.id}', 'fname': '{doc.fname}'}}" + ) + total += storage.get_file_size_kb(doc.object_store_url) + return total + + def execute_job( request: dict, with_assistant: bool, diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index 1b0ae0ace..db972c92d 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -2,7 +2,6 @@ import json import ast import re -from typing import TYPE_CHECKING from uuid import UUID from fastapi import HTTPException @@ -12,9 +11,6 @@ from app.api.deps import SessionDep from app.models import DocumentCollection, Collection, CollectionPublic, Document -if TYPE_CHECKING: - from app.core.cloud.storage import CloudStorage - logger = logging.getLogger(__name__) @@ -66,22 +62,6 @@ def extract_error_message(err: Exception) -> str: return message.strip()[:1000] -def calculate_total_size_kb(documents: list[Document], storage: CloudStorage) -> float: - """ - Sum document sizes in KB. Uses the stored file_size_kb if available. 
- """ - total: float = 0 - for doc in documents: - if doc.file_size_kb is not None: - total += doc.file_size_kb - else: - logger.info( - f"[calculate_total_size_kb] file_size_kb missing, fetching from storage | {{'doc_id': '{doc.id}', 'fname': '{doc.fname}'}}" - ) - total += storage.get_file_size_kb(doc.object_store_url) - return total - - def batch_documents(documents: list[Document]) -> list[list[Document]]: """ Batch documents dynamically based on size and count limits. @@ -102,7 +82,7 @@ def batch_documents(documents: list[Document]) -> list[list[Document]]: current_batch_size_kb = 0 for doc in documents: - doc_size_kb = doc.file_size_kb or 15 * 1024 + doc_size_kb = doc.file_size_kb if doc.file_size_kb is not None else 15 * 1024 would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT From bed1d1a4f13f2cd918e639c0dd288fe9a661dfdc Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 17 Apr 2026 15:35:28 +0530 Subject: [PATCH 4/6] test cases failing --- backend/app/services/collections/create_collection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 887208e18..bd55c2871 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -14,6 +14,7 @@ CollectionJobCrud, ) from app.models import ( + Document, CollectionJobStatus, CollectionJob, Collection, @@ -21,6 +22,7 @@ CollectionJobPublic, CreationRequest, ) +from app.core.cloud.storage import CloudStorage from app.services.collections.helpers import ( extract_error_message, to_collection_public, @@ -135,9 +137,7 @@ def _mark_job_failed( return None -def calculate_total_size_kb( - documents: list[Document], storage: "CloudStorage" -) -> float: +def calculate_total_size_kb(documents: list[Document], storage: CloudStorage) -> 
float: """ Sum document sizes in KB. Uses the stored file_size_kb if available. """ From d02bac8a57d037327e501eb3e74ecfaa46c70c34 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 17 Apr 2026 17:01:00 +0530 Subject: [PATCH 5/6] changing the logic --- .../services/collections/create_collection.py | 39 +++++++++---------- backend/app/services/collections/helpers.py | 2 +- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index bd55c2871..14696b191 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -14,7 +14,6 @@ CollectionJobCrud, ) from app.models import ( - Document, CollectionJobStatus, CollectionJob, Collection, @@ -22,7 +21,6 @@ CollectionJobPublic, CreationRequest, ) -from app.core.cloud.storage import CloudStorage from app.services.collections.helpers import ( extract_error_message, to_collection_public, @@ -137,22 +135,6 @@ def _mark_job_failed( return None -def calculate_total_size_kb(documents: list[Document], storage: CloudStorage) -> float: - """ - Sum document sizes in KB. Uses the stored file_size_kb if available. - """ - total: float = 0 - for doc in documents: - if doc.file_size_kb is not None: - total += doc.file_size_kb - else: - logger.info( - f"[calculate_total_size_kb] file_size_kb missing, fetching from storage | {{'doc_id': '{doc.id}', 'fname': '{doc.fname}'}}" - ) - total += storage.get_file_size_kb(doc.object_store_url) - return total - - def execute_job( request: dict, with_assistant: bool, @@ -190,10 +172,27 @@ def execute_job( storage = get_cloud_storage(session=session, project_id=project_id) file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." 
in doc.fname} - total_size_kb = calculate_total_size_kb(flat_docs, storage) - total_size_mb = round(total_size_kb / 1024, 2) + + backfill: list[tuple[UUID, float]] = [] + for doc in flat_docs: + if doc.file_size_kb is None: + size_kb = round(storage.get_file_size_kb(doc.object_store_url)) + doc.file_size_kb = size_kb + backfill.append((doc.id, size_kb)) + + total_size_kb = sum( + doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None + ) + total_size_mb = total_size_kb / 1024 with Session(engine) as session: + if backfill: + document_crud = DocumentCrud(session, project_id) + for doc_id, size_kb in backfill: + doc = document_crud.read_one(doc_id) + doc.file_size_kb = size_kb + document_crud.update(doc) + collection_job_crud = CollectionJobCrud(session, project_id) collection_job = collection_job_crud.read_one(job_uuid) collection_job = collection_job_crud.update( diff --git a/backend/app/services/collections/helpers.py b/backend/app/services/collections/helpers.py index db972c92d..3f0a0cefd 100644 --- a/backend/app/services/collections/helpers.py +++ b/backend/app/services/collections/helpers.py @@ -82,7 +82,7 @@ def batch_documents(documents: list[Document]) -> list[list[Document]]: current_batch_size_kb = 0 for doc in documents: - doc_size_kb = doc.file_size_kb if doc.file_size_kb is not None else 15 * 1024 + doc_size_kb = doc.file_size_kb would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT From 8b7556c226449f0f9a52bc66906642bd55fa8068 Mon Sep 17 00:00:00 2001 From: nishika26 Date: Fri, 17 Apr 2026 20:37:30 +0530 Subject: [PATCH 6/6] fixing test cases --- backend/app/services/collections/create_collection.py | 2 +- backend/app/tests/services/collections/test_helpers.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/backend/app/services/collections/create_collection.py b/backend/app/services/collections/create_collection.py index 
14696b191..25aba0919 100644 --- a/backend/app/services/collections/create_collection.py +++ b/backend/app/services/collections/create_collection.py @@ -200,7 +200,7 @@ def execute_job( CollectionJobUpdate( task_id=task_id, status=CollectionJobStatus.PROCESSING, - total_size_mb=total_size_mb, + total_size_mb=round(total_size_mb, 2), ), ) diff --git a/backend/app/tests/services/collections/test_helpers.py b/backend/app/tests/services/collections/test_helpers.py index 7cddaf305..8b43946a1 100644 --- a/backend/app/tests/services/collections/test_helpers.py +++ b/backend/app/tests/services/collections/test_helpers.py @@ -122,14 +122,12 @@ def test_batch_documents_mixed_size_batching() -> None: assert len(batches[2]) == 1 # 15 MB total -def test_batch_documents_with_none_file_size() -> None: - """Test that documents with None file_size are treated as 0 bytes.""" +def test_batch_documents_with_none_file_size_raises() -> None: + """Test that documents with None file_size raise TypeError — sizes must be backfilled before batching.""" docs = create_fake_documents(10, file_size_kb=None) - batches = helpers.batch_documents(docs) - # All files with None/0 size should fit in one batch (under both limits) - assert len(batches) == 1 - assert len(batches[0]) == 10 + with pytest.raises(TypeError): + helpers.batch_documents(docs) def test_batch_documents_empty_input() -> None: