Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 86 additions & 82 deletions sagemaker-core/src/sagemaker/core/resources.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions sagemaker-core/src/sagemaker/core/tools/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@

RESOURCE_WITH_LOGS = set(["TrainingJob", "ProcessingJob", "TransformJob"])

DEFAULT_TIMEOUT_MESSAGE = "Increase the timeout and try again."
RESOURCE_TIMEOUT_MESSAGES = {
"TrainingJob": "Your training job is still running. Call .refresh() to check its current status.",
}

CONFIGURABLE_ATTRIBUTE_SUBSTRINGS = [
"kms",
"s3",
Expand Down
3 changes: 3 additions & 0 deletions sagemaker-core/src/sagemaker/core/tools/resources_codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
PYTHON_TYPES_TO_BASIC_JSON_TYPES,
CONFIGURABLE_ATTRIBUTE_SUBSTRINGS,
RESOURCE_WITH_LOGS,
DEFAULT_TIMEOUT_MESSAGE,
RESOURCE_TIMEOUT_MESSAGES,
)
from sagemaker.core.tools.method import Method, MethodType
from sagemaker.core.utils.utils import (
Expand Down Expand Up @@ -1742,6 +1744,7 @@ def generate_wait_method(self, resource_name: str) -> str:
logs_arg_doc=logs_arg_doc,
init_wait_logs=init_wait_logs,
print_wait_logs=print_wait_logs,
timeout_message=RESOURCE_TIMEOUT_MESSAGES.get(resource_name, DEFAULT_TIMEOUT_MESSAGE),
)
return formatted_method

Expand Down
6 changes: 3 additions & 3 deletions sagemaker-core/src/sagemaker/core/tools/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ def wait(
return

if timeout is not None and time.time() - start_time >= timeout:
raise TimeoutExceededError(resouce_type="{resource_name}", status=current_status)
raise TimeoutExceededError(resource_type="{resource_name}", status=current_status, message="{timeout_message}")
time.sleep(poll)
'''

Expand Down Expand Up @@ -385,7 +385,7 @@ def wait_for_status(
return
{failed_error_block}
if timeout is not None and time.time() - start_time >= timeout:
raise TimeoutExceededError(resouce_type="{resource_name}", status=current_status)
raise TimeoutExceededError(resource_type="{resource_name}", status=current_status)
time.sleep(poll)
'''

Expand Down Expand Up @@ -436,7 +436,7 @@ def wait_for_delete(
{deleted_status_check}

if timeout is not None and time.time() - start_time >= timeout:
raise TimeoutExceededError(resouce_type="{resource_name}", status=current_status)
raise TimeoutExceededError(resource_type="{resource_name}", status=current_status)
except botocore.exceptions.ClientError as e:
error_code = e.response["Error"]["Code"]

Expand Down
7 changes: 4 additions & 3 deletions sagemaker-core/src/sagemaker/core/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,16 +79,17 @@ def __init__(self, resource_type="(Unkown)", reason="(Unkown)"):
class TimeoutExceededError(WaiterError):
"""Raised when a specified timeout is exceeded"""

fmt = "Timeout exceeded while waiting for {resource_type}. Final Resource State: {status}. Increase the timeout and try again."
fmt = "Timeout exceeded while waiting for {resource_type}. Final Resource State: {status}. {message}"

def __init__(self, resource_type="(Unkown)", status="(Unkown)", reason="(Unkown)"):
def __init__(self, resource_type="(Unkown)", status="(Unkown)", reason="(Unkown)", message="Increase the timeout and try again."):
"""Initialize a TimeoutExceededError exception.
Args:
resource_type (str): The type of resource being waited on.
status (str): The final status of the resource.
reason (str): The reason the resource entered a failed state.
message (str): Additional context or guidance for the user.
"""
super().__init__(resource_type=resource_type, status=status, reason=reason)
super().__init__(resource_type=resource_type, status=status, reason=reason, message=message)


### Intelligent Defaults Errors
Expand Down
31 changes: 31 additions & 0 deletions sagemaker-core/tests/unit/utils/test_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from sagemaker.core.utils.exceptions import TimeoutExceededError


class TestTimeoutExceededError:
    """Unit tests for the text rendered by ``TimeoutExceededError``."""

    def test_default_message(self):
        """Omitting ``message`` renders the 'Increase the timeout' guidance."""
        expected = (
            "Timeout exceeded while waiting for TrainingJob. "
            "Final Resource State: InProgress. "
            "Increase the timeout and try again."
        )
        error = TimeoutExceededError(resource_type="TrainingJob", status="InProgress")
        assert str(error) == expected

    def test_custom_message(self):
        """A caller-supplied ``message`` is rendered in place of the default guidance."""
        expected = (
            "Timeout exceeded while waiting for EvaluationJob. "
            "Final Resource State: Executing. "
            "Your evaluation job is still running. Call .refresh() to check its current status."
        )
        error = TimeoutExceededError(
            resource_type="EvaluationJob",
            status="Executing",
            message="Your evaluation job is still running. Call .refresh() to check its current status.",
        )
        assert str(error) == expected

    def test_default_params(self):
        """Constructing with no arguments works and renders the placeholder defaults."""
        rendered = str(TimeoutExceededError())
        # Both the placeholder value and the default guidance must be present.
        for fragment in ("(Unkown)", "Increase the timeout and try again."):
            assert fragment in rendered
15 changes: 9 additions & 6 deletions sagemaker-train/src/sagemaker/train/evaluate/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -1128,8 +1128,9 @@ def wait(
if timeout is not None and time.time() - start_time >= timeout:
from sagemaker.core.utils.exceptions import TimeoutExceededError
raise TimeoutExceededError(
resource_type="PipelineExecution",
status=current_status
resource_type="EvaluationJob",
status=current_status,
message="Your evaluation job is still running. Call .refresh() to check its current status.",
)

time.sleep(poll)
Expand Down Expand Up @@ -1179,8 +1180,9 @@ def wait(
if timeout is not None and time.time() - start_time >= timeout:
from sagemaker.core.utils.exceptions import TimeoutExceededError
raise TimeoutExceededError(
resource_type="PipelineExecution",
status=current_status
resource_type="EvaluationJob",
status=current_status,
message="Your evaluation job is still running. Call .refresh() to check its current status.",
)

time.sleep(poll)
Expand Down Expand Up @@ -1208,8 +1210,9 @@ def wait(
if timeout is not None and elapsed >= timeout:
from sagemaker.core.utils.exceptions import TimeoutExceededError
raise TimeoutExceededError(
resource_type="PipelineExecution",
status=current_status
resource_type="EvaluationJob",
status=current_status,
message="Your evaluation job is still running. Call .refresh() to check its current status.",
)

time.sleep(poll)
Expand Down
4 changes: 3 additions & 1 deletion sagemaker-train/tests/unit/train/evaluate/test_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,8 +1132,10 @@ def test_wait_timeout_exceeded(self, mock_time):
# Mock time to simulate timeout
mock_time.side_effect = [0, 10, 20, 30, 40, 50, 60] # Exceeds timeout

with pytest.raises(TimeoutExceededError):
with pytest.raises(TimeoutExceededError, match="EvaluationJob") as exc_info:
execution.wait(target_status="Succeeded", poll=1, timeout=5)
assert "still running" in str(exc_info.value)
assert ".refresh()" in str(exc_info.value)

def test_wait_without_pipeline_execution(self):
"""Test wait when no pipeline execution is set."""
Expand Down
Loading