Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 51 additions & 5 deletions src/backend/endpoints/download_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,24 @@
token: str = None
):
validate_token_in_query_param(token)
file_path = os.path.join(DDA_SYSTEM_FOLDER, fileName)

# Sanitize filename to prevent path traversal
safe_filename = os.path.basename(fileName)
if safe_filename != fileName or '..' in fileName:
raise HTTPException(
status_code=400,
detail="Invalid filename"
)

file_path = os.path.join(DDA_SYSTEM_FOLDER, safe_filename)

# Ensure the resolved path is within the allowed directory
if not os.path.abspath(file_path).startswith(os.path.abspath(DDA_SYSTEM_FOLDER)):
raise HTTPException(
status_code=400,
detail="Access denied"
)

if os.path.exists(file_path):
return FileResponse(file_path)

Expand All @@ -103,7 +120,18 @@
capture_details = inference_result_accessor.get_inference_result(db, captureId)
if capture_details is None:
logger.info("Getting image from jsonl file")
jsonl_file = f"/aws_dda/inference-results/{workflowId}/{captureId}.jsonl"
# Sanitize path components to prevent directory traversal
safe_workflow_id = os.path.basename(workflowId)
safe_capture_id = os.path.basename(captureId)
if (safe_workflow_id != workflowId or safe_capture_id != captureId or
'..' in workflowId or '..' in captureId or
os.path.sep in safe_workflow_id or os.path.sep in safe_capture_id):
raise HTTPException(
status_code=400,
detail="Invalid workflow or capture ID"
)

jsonl_file = f"/aws_dda/inference-results/{safe_workflow_id}/{safe_capture_id}.jsonl"
try:
with open(jsonl_file, 'r') as f:
json_str = f.read()
Expand Down Expand Up @@ -184,12 +212,30 @@
# First, POST to /workflows/{workflow_id}/results/export, which writes out the data to a file on disk
# Then, GET from /workflows/{workflow_id}/results/export (here), which loads the data from said file on disk
if captureIdPath:
if not os.path.exists(captureIdPath):
# Sanitize path to prevent directory traversal
safe_path = os.path.basename(captureIdPath)
if safe_path != captureIdPath or '..' in captureIdPath or os.path.sep in safe_path:
raise HTTPException(
status_code=400,
detail="Invalid file path"
)

# Construct full path within allowed directory
full_path = os.path.join(DDA_SYSTEM_FOLDER, safe_path)

# Ensure resolved path stays within DDA_SYSTEM_FOLDER
if not os.path.abspath(full_path).startswith(os.path.abspath(DDA_SYSTEM_FOLDER)):
raise HTTPException(
status_code=400,
detail="Access denied"
)

if not os.path.exists(full_path):

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.

Copilot Autofix

AI 4 months ago

The best practice for managing potentially user-controlled file paths is:

  • Normalize (canonicalize) the complete path after joining the user input to the intended base directory. This step removes symlinks, redundant path separators, and traversal segments like ...
  • Then, check after normalization that the resulting path is strictly within the intended parent directory using robust containment logic (not just startswith, which is error-prone for directories like /tmp/foo and /tmp/foobar).

Edits required:

  • In src/backend/endpoints/download_file.py, lines 216-228:
    • Remove or replace os.path.basename and associated manual checks.
    • Use os.path.normpath (and preferably os.path.abspath) after joining the user-supplied path to the base directory.
    • Ensure the containment check is robust: on Unix, this is often done with os.path.commonpath.
    • Use the normalized check before file access.
  • Add any imports needed (os is already imported).

Suggested changeset 1
src/backend/endpoints/download_file.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/src/backend/endpoints/download_file.py b/src/backend/endpoints/download_file.py
--- a/src/backend/endpoints/download_file.py
+++ b/src/backend/endpoints/download_file.py
@@ -212,30 +212,22 @@
     # First, POST to /workflows/{workflow_id}/results/export, which writes out the data to a file on disk
     # Then, GET from /workflows/{workflow_id}/results/export (here), which loads the data from said file on disk
     if captureIdPath:
-        # Sanitize path to prevent directory traversal
-        safe_path = os.path.basename(captureIdPath)
-        if safe_path != captureIdPath or '..' in captureIdPath or os.path.sep in safe_path:
+        # Normalize path and ensure it is descendant of the allowed root
+        candidate_path = os.path.normpath(os.path.join(DDA_SYSTEM_FOLDER, captureIdPath))
+        abs_candidate_path = os.path.abspath(candidate_path)
+        dda_system_folder_abs = os.path.abspath(DDA_SYSTEM_FOLDER)
+        # Strict directory containment check using commonpath
+        if os.path.commonpath([abs_candidate_path, dda_system_folder_abs]) != dda_system_folder_abs:
             raise HTTPException(
                 status_code=400,
-                detail="Invalid file path"
+                detail="Invalid or unsafe file path"
             )
-        
-        # Construct full path within allowed directory
-        full_path = os.path.join(DDA_SYSTEM_FOLDER, safe_path)
-        
-        # Ensure resolved path stays within DDA_SYSTEM_FOLDER
-        if not os.path.abspath(full_path).startswith(os.path.abspath(DDA_SYSTEM_FOLDER)):
+        if not os.path.exists(abs_candidate_path):
             raise HTTPException(
-                status_code=400,
-                detail="Access denied"
-            )
-        
-        if not os.path.exists(full_path):
-            raise HTTPException(
                 status_code=HTTP_404_NOT_FOUND,
                 detail="File not found",
             )
-        with open(full_path) as json_file:
+        with open(abs_candidate_path) as json_file:
             inference_result_data_list = json.load(json_file)
     else:
         # Regular case, query the data ourselves
EOF
@@ -212,30 +212,22 @@
# First, POST to /workflows/{workflow_id}/results/export, which writes out the data to a file on disk
# Then, GET from /workflows/{workflow_id}/results/export (here), which loads the data from said file on disk
if captureIdPath:
# Sanitize path to prevent directory traversal
safe_path = os.path.basename(captureIdPath)
if safe_path != captureIdPath or '..' in captureIdPath or os.path.sep in safe_path:
# Normalize path and ensure it is descendant of the allowed root
candidate_path = os.path.normpath(os.path.join(DDA_SYSTEM_FOLDER, captureIdPath))
abs_candidate_path = os.path.abspath(candidate_path)
dda_system_folder_abs = os.path.abspath(DDA_SYSTEM_FOLDER)
# Strict directory containment check using commonpath
if os.path.commonpath([abs_candidate_path, dda_system_folder_abs]) != dda_system_folder_abs:
raise HTTPException(
status_code=400,
detail="Invalid file path"
detail="Invalid or unsafe file path"
)

# Construct full path within allowed directory
full_path = os.path.join(DDA_SYSTEM_FOLDER, safe_path)

# Ensure resolved path stays within DDA_SYSTEM_FOLDER
if not os.path.abspath(full_path).startswith(os.path.abspath(DDA_SYSTEM_FOLDER)):
if not os.path.exists(abs_candidate_path):
raise HTTPException(
status_code=400,
detail="Access denied"
)

if not os.path.exists(full_path):
raise HTTPException(
status_code=HTTP_404_NOT_FOUND,
detail="File not found",
)
with open(full_path) as json_file:
with open(abs_candidate_path) as json_file:
inference_result_data_list = json.load(json_file)
else:
# Regular case, query the data ourselves
Copilot is powered by AI and may make mistakes. Always verify output.
raise HTTPException(
status_code=HTTP_404_NOT_FOUND,
detail=f"The server can't get the capture id file. Error: 'The path {captureIdPath} couldn't be found'. Check the path and try again.",
detail="File not found",
)
with open(captureIdPath) as json_file:
with open(full_path) as json_file:

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.

Copilot Autofix

AI 4 months ago

The optimal fix is to eliminate reliance on os.path.basename and instead normalize the provided path by using os.path.normpath(os.path.join(DDA_SYSTEM_FOLDER, captureIdPath)). After normalization, check that the resultant path is strictly within DDA_SYSTEM_FOLDER by verifying it starts with the canonical root directory path. The validation should be based on canonical (absolute, normalized) paths, not just the user-provided string or basename.

Update the logic as follows:

  • Remove safe_path = os.path.basename(captureIdPath) and the related check.
  • Instead, resolve the candidate file path by normalizing with os.path.normpath and then converting to os.path.abspath.
  • Check that the resultant path starts with the canonical root directory.
  • Leave all other logic intact.

All changes should be made in the get_inference_result_data_for_retraining function in src/backend/endpoints/download_file.py.


Suggested changeset 1
src/backend/endpoints/download_file.py

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/src/backend/endpoints/download_file.py b/src/backend/endpoints/download_file.py
--- a/src/backend/endpoints/download_file.py
+++ b/src/backend/endpoints/download_file.py
@@ -212,30 +212,21 @@
     # First, POST to /workflows/{workflow_id}/results/export, which writes out the data to a file on disk
     # Then, GET from /workflows/{workflow_id}/results/export (here), which loads the data from said file on disk
     if captureIdPath:
-        # Sanitize path to prevent directory traversal
-        safe_path = os.path.basename(captureIdPath)
-        if safe_path != captureIdPath or '..' in captureIdPath or os.path.sep in safe_path:
+        # Sanitize and validate path to prevent directory traversal
+        candidate_path = os.path.normpath(os.path.join(DDA_SYSTEM_FOLDER, captureIdPath))
+        root_folder = os.path.abspath(DDA_SYSTEM_FOLDER)
+        resolved_path = os.path.abspath(candidate_path)
+        if not resolved_path.startswith(root_folder + os.sep):
             raise HTTPException(
                 status_code=400,
-                detail="Invalid file path"
+                detail="Invalid or disallowed file path"
             )
-        
-        # Construct full path within allowed directory
-        full_path = os.path.join(DDA_SYSTEM_FOLDER, safe_path)
-        
-        # Ensure resolved path stays within DDA_SYSTEM_FOLDER
-        if not os.path.abspath(full_path).startswith(os.path.abspath(DDA_SYSTEM_FOLDER)):
+        if not os.path.exists(resolved_path):
             raise HTTPException(
-                status_code=400,
-                detail="Access denied"
-            )
-        
-        if not os.path.exists(full_path):
-            raise HTTPException(
                 status_code=HTTP_404_NOT_FOUND,
                 detail="File not found",
             )
-        with open(full_path) as json_file:
+        with open(resolved_path) as json_file:
             inference_result_data_list = json.load(json_file)
     else:
         # Regular case, query the data ourselves
EOF
@@ -212,30 +212,21 @@
# First, POST to /workflows/{workflow_id}/results/export, which writes out the data to a file on disk
# Then, GET from /workflows/{workflow_id}/results/export (here), which loads the data from said file on disk
if captureIdPath:
# Sanitize path to prevent directory traversal
safe_path = os.path.basename(captureIdPath)
if safe_path != captureIdPath or '..' in captureIdPath or os.path.sep in safe_path:
# Sanitize and validate path to prevent directory traversal
candidate_path = os.path.normpath(os.path.join(DDA_SYSTEM_FOLDER, captureIdPath))
root_folder = os.path.abspath(DDA_SYSTEM_FOLDER)
resolved_path = os.path.abspath(candidate_path)
if not resolved_path.startswith(root_folder + os.sep):
raise HTTPException(
status_code=400,
detail="Invalid file path"
detail="Invalid or disallowed file path"
)

# Construct full path within allowed directory
full_path = os.path.join(DDA_SYSTEM_FOLDER, safe_path)

# Ensure resolved path stays within DDA_SYSTEM_FOLDER
if not os.path.abspath(full_path).startswith(os.path.abspath(DDA_SYSTEM_FOLDER)):
if not os.path.exists(resolved_path):
raise HTTPException(
status_code=400,
detail="Access denied"
)

if not os.path.exists(full_path):
raise HTTPException(
status_code=HTTP_404_NOT_FOUND,
detail="File not found",
)
with open(full_path) as json_file:
with open(resolved_path) as json_file:
inference_result_data_list = json.load(json_file)
else:
# Regular case, query the data ourselves
Copilot is powered by AI and may make mistakes. Always verify output.
inference_result_data_list = json.load(json_file)
else:
# Regular case, query the data ourselves
Expand Down
4 changes: 2 additions & 2 deletions src/backend/exceptions/handlers/exception_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONR
exception_name = getattr(exception_type, "__name__", None)

logger.error("Uncaught Exception", exc_info=(exception_type, exception_value, exception_traceback))

return JSONResponse({'message': f"Internal Server Error. Error: '{exception_name}: {exception_value}.'", 'request_id': get_request_id()}, status_code=500)
return JSONResponse({'message': "Internal Server Error. Please contact admin.", 'request_id': get_request_id()}, status_code=500)
#return JSONResponse({'message': f"Internal Server Error. Error: '{exception_name}: {exception_value}.'", 'request_id': get_request_id()}, status_code=500)

def api_exception_logger(request: Request, exc):
url = f"{request.url.path}?{request.query_params}" if request.query_params else request.url.path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ int main()
// Will be invoked when mqtt topic 'broker-test-subscription' is published too
CHECKHR(broker->Subscribe("test-subscription", [&](IPayload* payload)
{
TraceInfo("Received message: %s", payload->SerializeAsString());
TraceInfo("Received message: %s", payload->SerializeAsString().c_str());
}));
int32_t test_subscription_cancellation_token = hr;

Expand All @@ -100,7 +100,7 @@ int main()
// when a publish happens with message_id = `test_message`
CHECKHR(broker->Subscribe("test_message", [&](IPayload* payload)
{
TraceInfo("Received locally: %s", payload->SerializeAsString());
TraceInfo("Received locally: %s", payload->SerializeAsString().c_str());
}));
int32_t test_message_cancellation_token = hr;

Expand Down