2 changes: 1 addition & 1 deletion backend/app/api/docs/documents/upload.md
@@ -1,6 +1,6 @@
 Upload a document to Kaapi.
 
-- If only a file is provided, the document will be uploaded and stored, and its ID will be returned.
+- If only a file is provided, the document will be uploaded and stored, and its ID will be returned. The maximum file size allowed for upload is 25 MB.
 - If a target format is specified, a transformation job will also be created to transform the document into the target format in the background. The response will include both the uploaded document details and information about the transformation job.
 - If a callback URL is provided, you will receive a notification at that URL once the document transformation job is completed.
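Client code can enforce the 25 MB cap before spending bandwidth on a doomed request. A minimal sketch, assuming a `requests`-based client; the route, the form-field names (`target_format`, `callback_url`), and the bearer-token auth are illustrative assumptions rather than the documented API surface:

```python
import os

import requests  # assumed HTTP client

MAX_UPLOAD_MB = 25  # documented upload cap


def upload_document(
    path: str,
    api_url: str,
    token: str,
    target_format: str | None = None,
    callback_url: str | None = None,
) -> dict:
    """Upload a file to Kaapi, rejecting anything over the 25 MB cap locally."""
    size_mb = os.path.getsize(path) / (1024 * 1024)
    if size_mb > MAX_UPLOAD_MB:
        raise ValueError(f"{path} is {size_mb:.1f} MB; the limit is {MAX_UPLOAD_MB} MB")

    data: dict[str, str] = {}
    if target_format:
        data["target_format"] = target_format  # also schedules a transformation job
    if callback_url:
        data["callback_url"] = callback_url  # notified when that job completes

    with open(path, "rb") as fh:
        resp = requests.post(
            f"{api_url}/documents/upload",  # hypothetical route
            headers={"Authorization": f"Bearer {token}"},
            files={"file": fh},
            data=data,
            timeout=60,
        )
    resp.raise_for_status()
    return resp.json()
```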
28 changes: 22 additions & 6 deletions backend/app/services/collections/create_collection.py
@@ -151,11 +151,11 @@ def execute_job(
"""
start_time = time.time()

# Keeping the references for potential backout/cleanup on failure
collection_job = None
result = None
creation_request = None
provider = None
storage = None

try:
creation_request = CreationRequest(**request)
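A side note on the block above: initializing every reference the error path might need to `None` before the `try` lets the handler clean up exactly what was acquired and no more. A minimal, self-contained sketch of that pattern; the helpers here are hypothetical stand-ins, and the PR's actual `except` block sits outside the visible hunk:

```python
class JobError(RuntimeError):
    pass


def acquire_job() -> dict:
    # Stand-in for claiming the CollectionJob row.
    return {"id": "job-1", "status": "processing"}


def acquire_storage() -> object:
    # Stand-in for get_cloud_storage(...); simulate a mid-setup failure.
    raise JobError("storage unavailable")


def run_job() -> None:
    job = None
    storage = None
    try:
        job = acquire_job()
        storage = acquire_storage()  # fails here; the job is already claimed
    except Exception:
        # Back out only what was actually acquired before the failure.
        if storage is not None:
            print("releasing storage")
        if job is not None:
            job["status"] = "failed"  # stand-in for a status rollback update
            print(f"marked {job['id']} failed")
        raise


try:
    run_job()
except JobError as exc:
    print(f"job failed cleanly: {exc}")
```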
@@ -169,25 +169,41 @@ def execute_job(
         with Session(engine) as session:
             document_crud = DocumentCrud(session, project_id)
             flat_docs = document_crud.read_each(creation_request.documents)
+            storage = get_cloud_storage(session=session, project_id=project_id)
 
         file_exts = {doc.fname.split(".")[-1] for doc in flat_docs if "." in doc.fname}
-        total_size_kb = sum(doc.file_size_kb or 0 for doc in flat_docs)
-        total_size_mb = round(total_size_kb / 1024, 2)
+
+        backfill: list[tuple[UUID, float]] = []
+        for doc in flat_docs:
+            if doc.file_size_kb is None:
+                size_kb = round(storage.get_file_size_kb(doc.object_store_url))
+                doc.file_size_kb = size_kb
+                backfill.append((doc.id, size_kb))
+
+        total_size_kb = sum(
+            doc.file_size_kb for doc in flat_docs if doc.file_size_kb is not None
+        )
+        total_size_mb = total_size_kb / 1024
 
         with Session(engine) as session:
+            if backfill:
+                document_crud = DocumentCrud(session, project_id)
+                for doc_id, size_kb in backfill:
+                    doc = document_crud.read_one(doc_id)
+                    doc.file_size_kb = size_kb
+                    document_crud.update(doc)
+
             collection_job_crud = CollectionJobCrud(session, project_id)
             collection_job = collection_job_crud.read_one(job_uuid)
             collection_job = collection_job_crud.update(
                 job_uuid,
                 CollectionJobUpdate(
                     task_id=task_id,
                     status=CollectionJobStatus.PROCESSING,
-                    total_size_mb=total_size_mb,
+                    total_size_mb=round(total_size_mb, 2),
                 ),
             )
 
-            storage = get_cloud_storage(session=session, project_id=project_id)
-
             provider = get_llm_provider(
                 session=session,
                 provider=creation_request.provider,
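The shape of this change: a size missing from the database is fetched from object storage once, cached on the in-memory row so the job's total can be computed, and queued in `backfill` so a fresh session can persist it; subsequent runs then skip the storage round-trip. A condensed, self-contained sketch of that lazy write-back, with plain dicts standing in for the real CRUD and storage layers:

```python
from uuid import UUID, uuid4

# Stand-ins for the persistent table and object storage (illustrative only).
table: dict[UUID, dict] = {uuid4(): {"file_size_kb": None} for _ in range(3)}


def fetch_size_kb(doc_id: UUID) -> float:
    return 2048.0  # pretend every object is 2 MB in storage


# Session 1: read rows, backfilling missing sizes from storage in memory.
rows = {doc_id: dict(row) for doc_id, row in table.items()}  # detached copies
backfill: list[tuple[UUID, float]] = []
for doc_id, row in rows.items():
    if row["file_size_kb"] is None:            # size never recorded
        size_kb = round(fetch_size_kb(doc_id))
        row["file_size_kb"] = size_kb          # cache for this run's total
        backfill.append((doc_id, size_kb))

total_mb = sum(row["file_size_kb"] for row in rows.values()) / 1024

# Session 2: persist the discovered sizes so later runs skip storage entirely.
for doc_id, size_kb in backfill:
    table[doc_id]["file_size_kb"] = size_kb

print(f"total: {round(total_mb, 2)} MB, backfilled: {len(backfill)} rows")
```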
3 changes: 1 addition & 2 deletions backend/app/services/collections/helpers.py
@@ -19,7 +19,6 @@
 MAX_DOC_SIZE_MB = 25  # 25 MB maximum per document
 
 # Maximum batch size for uploading documents to vector store
-# Derived from MAX_DOC_SIZE + buffer to ensure single docs always fit
 MAX_BATCH_SIZE_KB = (MAX_DOC_SIZE_MB + 5) * 1024  # 30 MB in KB (25 + 5 MB buffer)
 MAX_BATCH_COUNT = 200  # Maximum documents per batch
 
@@ -83,7 +82,7 @@ def batch_documents(documents: list[Document]) -> list[list[Document]]:
     current_batch_size_kb = 0
 
     for doc in documents:
-        doc_size_kb = doc.file_size_kb or 0
+        doc_size_kb = doc.file_size_kb
 
         would_exceed_size = (current_batch_size_kb + doc_size_kb) > MAX_BATCH_SIZE_KB
         would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT
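To make the limits concrete: with `MAX_BATCH_SIZE_KB` at 30720 KB (30 MB) and `MAX_BATCH_COUNT` at 200, the batcher greedily fills a batch until adding the next document would cross either limit. A self-contained sketch of that rule (a simplified stand-in, not the app's `batch_documents`); note that a `None` size now fails fast with `TypeError` instead of being coerced to 0:

```python
from dataclasses import dataclass

MAX_BATCH_SIZE_KB = (25 + 5) * 1024  # 30720 KB, mirrors helpers.py
MAX_BATCH_COUNT = 200


@dataclass
class Doc:  # minimal stand-in for the app's Document model
    file_size_kb: int


def batch(docs: list[Doc]) -> list[list[Doc]]:
    batches: list[list[Doc]] = []
    current: list[Doc] = []
    size_kb = 0
    for doc in docs:
        too_big = size_kb + doc.file_size_kb > MAX_BATCH_SIZE_KB  # TypeError if size is None
        too_many = len(current) >= MAX_BATCH_COUNT
        if current and (too_big or too_many):
            batches.append(current)
            current, size_kb = [], 0
        current.append(doc)
        size_kb += doc.file_size_kb
    if current:
        batches.append(current)
    return batches


# Three 12 MB documents: the first two total 24 MB (under 30 MB), the third
# would push the batch to 36 MB, so it starts a new one.
assert [len(b) for b in batch([Doc(12 * 1024)] * 3)] == [2, 1]
```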
10 changes: 4 additions & 6 deletions backend/app/tests/services/collections/test_helpers.py
@@ -122,14 +122,12 @@ def test_batch_documents_mixed_size_batching() -> None:
     assert len(batches[2]) == 1  # 15 MB total
 
 
-def test_batch_documents_with_none_file_size() -> None:
-    """Test that documents with None file_size are treated as 0 bytes."""
+def test_batch_documents_with_none_file_size_raises() -> None:
+    """Test that documents with None file_size raise TypeError — sizes must be backfilled before batching."""
     docs = create_fake_documents(10, file_size_kb=None)
-    batches = helpers.batch_documents(docs)
-
-    # All files with None/0 size should fit in one batch (under both limits)
-    assert len(batches) == 1
-    assert len(batches[0]) == 10
+    with pytest.raises(TypeError):
+        helpers.batch_documents(docs)
 
 
 def test_batch_documents_empty_input() -> None: