From 5c02e32856e2aca9723bbe627e73cb69a0fc52dd Mon Sep 17 00:00:00 2001 From: rnichi1 <71671446+rnichi1@users.noreply.github.com> Date: Sat, 15 Feb 2025 14:28:59 +0100 Subject: [PATCH 1/3] Validate files and get graded-by-ai service from dockerhub redis is currently run separately. --- .gitignore | 4 + src/access_cli_sealuzh/__init__.py | 11 + src/access_cli_sealuzh/main.py | 325 +++++++++++++++++++++++++++-- src/access_cli_sealuzh/schema.py | 19 +- 4 files changed, 343 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 0e277e4..4291c20 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ __pycache__/ # C extensions *.so + # Distribution / packaging .Python build/ @@ -76,5 +77,8 @@ tags # Persistent undo [._]*.un~ +# Local +.idea/ + # End of https://www.toptal.com/developers/gitignore/api/vim,python diff --git a/src/access_cli_sealuzh/__init__.py b/src/access_cli_sealuzh/__init__.py index 48b48f2..efdd8f6 100644 --- a/src/access_cli_sealuzh/__init__.py +++ b/src/access_cli_sealuzh/__init__.py @@ -49,6 +49,17 @@ def main(): help = "recurse into nested structures (assignments/tasks) if applicable") parser.add_argument('-A', '--auto-detect', action='store_true', default=False, help = "attempt to auto-detect what is being validated") + parser.add_argument('--llm-api-key', type=str, + help = "API key for the LLM service. Create one at the corresponding service provider.") + parser.add_argument('--llm-keep-service', action=argparse.BooleanOptionalAction, + help = "Keep LLM service running after validation for further grading") + parser.add_argument('--llm-only', action=argparse.BooleanOptionalAction, + help = "Only run LLM grading (for TAs designing tasks)") + parser.add_argument('--assistant-url', + default="http://localhost:4000", + help = "URL of the assistant service") + parser.add_argument('--llm-model', type=str, + help = "Model to use for LLM grading") args = parser.parse_args() diff --git a/src/access_cli_sealuzh/main.py b/src/access_cli_sealuzh/main.py index 86b1c66..1ad9433 100755 --- a/src/access_cli_sealuzh/main.py +++ b/src/access_cli_sealuzh/main.py @@ -11,6 +11,8 @@ from access_cli_sealuzh.logger import Logger from cerberus import Validator from access_cli_sealuzh.schema import * +import requests +import time def autodetect(args): # if a directory has been specified, assume that's what we're validating @@ -222,21 +224,61 @@ def validate_task(self, course_dir=None, assignment_dir=None, task_dir=None): self.execute_command(task, config, "test_command", self.args.test) if self.args.test_solution: self.execute_command(task, config, "test_command", 0, solve_command=self.args.solve_command) - if self.args.grade_template: - self.execute_grade_command(task, config, 0) - if self.args.grade_solution: - self.execute_grade_command(task, config, config["max_points"], self.args.solve_command) - # TODO GBAI: - # * OPTIONALLY: use AI grading on sample solution (use self.args.solve_command like above) + + # Skip non-AI validation if --llm-only is set + if not self.args.llm_only: + if self.args.grade_template: + self.execute_grade_command(task, config, 0) + if self.args.grade_solution: + self.execute_grade_command(task, config, config["max_points"], self.args.solve_command) + + + # Validate LLM configuration if present + if "llm" in config: + # Check required files exist + for file_key in ["submission", "rubrics", "examples", "solution"]: + if file_key in config["llm"]: + file_path = config["llm"][file_key] + if not os.path.isfile(os.path.join(task, file_path)): + 
self.logger.error(f"{path} llm references non-existing {file_key} file: {file_path}") + + # Check optional files exist if specified + for file_key in ["post", "pre"]: + if file_key in config["llm"]: + file_path = config["llm"][file_key] + if not os.path.isfile(os.path.join(task, file_path)): + self.logger.error(f"{path} llm references non-existing {file_key} file: {file_path}") + + # Validate that referenced files are in the correct context + for file_path in [config["llm"].get(k) for k in ["rubrics", "examples", "solution", "post", "pre"] if k in config["llm"]]: + if file_path in config["files"]["editable"]: + self.logger.error(f"{path} llm file {file_path} marked as editable") + if file_path in config["files"]["visible"]: + self.logger.error(f"{path} llm file {file_path} marked as visible") + + # Run AI validation if --llm-only is set + if self.args.llm_only and "llm" in config: + # Check if we should override model from CLI + model_override = getattr(self.args, 'llm_model', None) # Safely get llm_model + if model_override: # Override model from CLI + config["llm"]["model"] = model_override + + if self.args.grade_template: + self.execute_ai_grading(task, config, 0) + if self.args.grade_solution: + self.execute_ai_grading(task, config, config["llm"]["max_points"], test_solution=True) # TODO GBAI: - # * def execute_ai_grading(...? - # * first check if the AI service is running, if not, pull and start - # * self.logger.error if something goes wrong - # * if there are no errors, that means the validation passes - # * validate that the template receives 0 points - # * validate that the sample solution receives expected llm.max_points - # * use self.print to show AI results (points and feedback) + + # TODO GBAI: + # * def execute_ai_grading(...? + # * first check if the AI service is running, if not, pull and start + # * self.logger.error if something goes wrong + # * if there are no errors, that means the validation passes + # * validate that the template receives 0 points + # * validate that the sample solution receives expected llm.max_points + # * use self.print to show AI results (points and feedback) + def execute_grade_command(self, task, config, expected_points, solve_command=None): grade_results = self.execute_command(task, config, "grade_command", solve_command=solve_command) @@ -348,10 +390,265 @@ def print(self, string, verbose=False): if self.args.verbose or verbose: print(string) + def read_rubrics_from_toml(self, task_path, rubrics_file): + """Read and parse rubrics TOML file into JSON string.""" + abs_path = os.path.join(task_path, rubrics_file) + if not os.path.exists(abs_path): + return None + + with open(abs_path, 'rb') as f: + rubrics_config = tomli.load(f) + + rubrics_list = [] + if "rubrics" in rubrics_config: + for rubric in rubrics_config["rubrics"]: + rubrics_list.append({ + "id": rubric["id"], + "title": rubric["title"], + "points": float(rubric["points"]) + }) + + return rubrics_list + + def read_examples_from_toml(self, task_path, examples_file): + """Read and parse examples TOML file into JSON string.""" + abs_path = os.path.join(task_path, examples_file) + if not os.path.exists(abs_path): + return None + + with open(abs_path, 'rb') as f: + examples_config = tomli.load(f) + + examples_list = [] + if "examples" in examples_config: + for example in examples_config["examples"]: + examples_list.append({ + "answer": example["answer"], + "points": str(example["points"]) + }) + + return examples_list + + def start_llm_service(self): + """Start the LLM service 
container.""" + try: + # Pull the images + if self.args.verbose: + self.logger.info("Pulling required images") + subprocess.run(["docker", "pull", "sealuzh/graded-by-ai"], check=True, capture_output=True) + subprocess.run(["docker", "pull", "redis:latest"], check=True, capture_output=True) + + # Create network if it doesn't exist + subprocess.run(["docker", "network", "create", "llm-network"], capture_output=True) + + # Start Redis container + subprocess.run([ + "docker", "run", "--rm", "-d", + "--network", "llm-network", + "--name", "redis", + "redis:latest" + ], check=True, capture_output=True) + + # Create temporary file for container ID and ensure it doesn't exist + self.cid_file = tempfile.NamedTemporaryFile(delete=False) + if os.path.exists(self.cid_file.name): + os.unlink(self.cid_file.name) + + # Start the LLM service container + instruction = [ + "docker", "run", "--rm", "-d", + "--cidfile", self.cid_file.name, + "--network", "llm-network", + "-p", "4000:4000", + "-e", "REDIS_HOST=redis", + "sealuzh/graded-by-ai" + ] + + subprocess.run(instruction, check=True, capture_output=True) + + # Wait for service to be ready + start_time = time.time() + while time.time() - start_time < 30: + with open(self.cid_file.name) as f: + container_id = f.read().strip() + result = subprocess.run(["docker", "logs", container_id], capture_output=True, text=True) + if "Nest application successfully started" in result.stdout: + if self.args.verbose: + self.logger.info("LLM service is ready") + return + time.sleep(1) + + raise Exception("LLM service failed to start within 30 seconds") + + except Exception as e: + self.stop_llm_service() + raise e + + def stop_llm_service(self): + """Stop the LLM service container and cleanup.""" + if not hasattr(self, 'cid_file') or self.cid_file is None: + return + + if not self.args.llm_keep_service: + try: + # Stop LLM container + with open(self.cid_file.name) as f: + container_id = f.read().strip() + if container_id: + subprocess.run(["docker", "stop", container_id], check=True) + + # Stop Redis container + subprocess.run(["docker", "stop", "redis"], check=True) + + # Remove network + subprocess.run(["docker", "network", "rm", "llm-network"], check=True) + + except Exception as e: + self.logger.error(f"Failed to stop containers: {str(e)}") + finally: + if os.path.exists(self.cid_file.name): + os.unlink(self.cid_file.name) + self.cid_file = None + + def execute_ai_grading(self, task, config, expected_points=None, test_solution=None): + if "llm" not in config: + return + + try: + self.start_llm_service() + + llm_config = config["llm"] + + # Read submission or solution content + if test_solution: # If validating solution + submission_content = self.read_text_file(task, llm_config["solution"]) + else: # If validating submission + submission_content = self.read_text_file(task, llm_config["submission"]) + + if not submission_content: + self.logger.error(f"Could not read {'solution' if test_solution else 'submission'} file") + return + + # Read and parse rubrics and examples + rubrics_content = self.read_rubrics_from_toml(task, llm_config["rubrics"]) + examples_content = self.read_examples_from_toml(task, llm_config["examples"]) + solution_content = self.read_text_file(task, llm_config["solution"]) + + # Read optional files + pre_content = self.read_text_file(task, llm_config.get("pre")) if "pre" in llm_config else None + post_content = self.read_text_file(task, llm_config.get("post")) if "post" in llm_config else None + prompt_content = self.read_text_file(task, 
llm_config.get("prompt")) if "prompt" in llm_config else None + + # Read instruction file + instruction_content = self.read_text_file(task, config["information"]["en"]["instructions_file"]) + + # Prepare evaluation request + model_family = llm_config.get("model_family", "claude") + default_model = "claude-3-5-sonnet-latest" if model_family == "claude" else "gpt-4o-mini" + model = llm_config.get("model", default_model) + + assistant_request = { + "question": instruction_content or "No instructions provided", + "answer": submission_content, + "llmType": model_family, + "chainOfThought": llm_config.get("cot", False), + "votingCount": llm_config.get("voting", 1), + "rubrics": rubrics_content if rubrics_content else [], + "prompt": prompt_content, + "prePrompt": pre_content, + "postPrompt": post_content, + "temperature": llm_config.get("temperature"), + "fewShotExamples": examples_content if examples_content else [], + "maxPoints": llm_config["max_points"], + "modelSolution": solution_content, + "llmModel": model, + "apiKey": self.args.llm_api_key + } + + # Initial request to start evaluation + response = requests.post( + f"{self.args.assistant_url}/evaluate", + json=assistant_request + ) + response.raise_for_status() + task_id = response.json()["jobId"] + + # Polling loop + max_attempts = 20 + delay_seconds = 2 + + for _ in range(max_attempts): + status_response = requests.get( + f"{self.args.assistant_url}/evaluate/{task_id}" + ) + status_response.raise_for_status() + status_data = status_response.json() + print("Polling: Current status of AI grading: ", status_data["status"]) + + if status_data["status"] == "completed": + result = status_data["result"] + + # Print results + self.print("╭──AI Grading Results──╮") + self.print(f"│ Points: {result['points']}/{config['llm']['max_points']}") + self.print("│ Feedback:") + for line in result['feedback'].split('\n'): + self.print(f"│ {line}") + if result.get('hint'): + self.print("│ Hint:") + self.print(f"│ {result['hint']}") + self.print("╰─────────────────────╯") + + # Validate points if expected_points is set + if expected_points is not None and result['points'] != expected_points: + self.logger.error( + f"AI grading: got {result['points']} points but expected {expected_points}" + ) + + return result + + elif status_data["status"] == "not_found": + self.logger.error(f"AI grading task not found: {task_id}") + return None + + time.sleep(delay_seconds) + + self.logger.error("AI grading timed out") + return None + + except requests.RequestException as e: + self.logger.error(f"AI grading failed: {str(e)}") + return None + finally: + self.stop_llm_service() + + def read_text_file(self, task_path, file_path): + """Read a text file. 
+ + Args: + task_path: Base path to the task directory + file_path: Relative path to the file + + Returns: + str: Content of the file or None if file doesn't exist + """ + if not file_path: + return None + + abs_path = os.path.join(task_path, file_path) + if not os.path.exists(abs_path): + return None + + try: + with open(abs_path, 'r', encoding='utf-8') as f: + return f.read() + except Exception as e: + self.logger.error(f"Failed to read file {file_path}: {str(e)}") + return None + def run(self): match self.args.level: case "course": self.validate_course(self.args.directory) case "assignment": self.validate_assignment(assignment_dir = self.args.directory) case "task": self.validate_task(task_dir = self.args.directory) return self.logger - diff --git a/src/access_cli_sealuzh/schema.py b/src/access_cli_sealuzh/schema.py index 2013387..fb930ac 100644 --- a/src/access_cli_sealuzh/schema.py +++ b/src/access_cli_sealuzh/schema.py @@ -77,7 +77,7 @@ # - that none of the grading or solution files are editable or visible # - that editable files are also visible # - OPTIONALLY: that the run, test and grade commands execute correctly -task_schema = { # TODO GBAI: add additional fields +task_schema = { "slug": {'required': True, 'type': 'string'}, "authors": {'required': False, 'type': 'list', 'schema': {'type': 'string'}}, @@ -102,6 +102,21 @@ 'schema': {'type': 'string'}}, "persist": {'required': False, 'type': 'list', 'schema': {'type': 'string'}} - }} + }}, + "llm": {'required': False, 'type': 'dict', 'schema': { + 'submission': {'required': True, 'type': 'string'}, + 'rubrics': {'required': False, 'type': 'string'}, + 'examples': {'required': False, 'type': 'string'}, + 'solution': {'required': False, 'type': 'string'}, + 'cot': {'required': False, 'type': 'boolean'}, + 'voting': {'required': False, 'type': 'integer'}, + 'post': {'required': False, 'type': 'string'}, + 'pre': {'required': False, 'type': 'string'}, + 'prompt': {'required': False, 'type': 'string'}, + 'temperature': {'required': False, 'type': 'float'}, + 'model': {'required': False, 'type': 'string'}, + 'max_points': {'required': False, 'type': 'float'}, + 'model_family': {'required': False, 'type': 'string'} + }}, } From 52fe50bb3ac4e0ff0e24711c789507af637fcdac Mon Sep 17 00:00:00 2001 From: rnichi1 <71671446+rnichi1@users.noreply.github.com> Date: Sat, 15 Feb 2025 23:39:37 +0100 Subject: [PATCH 2/3] finished functionality + tests --- src/access_cli_sealuzh/__init__.py | 10 +- src/access_cli_sealuzh/main.py | 126 +++++++++--------- src/access_cli_sealuzh/schema.py | 1 + .../resources/llm/complete-config/config.toml | 48 +++++++ .../llm/complete-config/grading/examples.toml | 7 + .../llm/complete-config/grading/post.md | 1 + .../llm/complete-config/grading/tests.py | 37 +++++ .../llm/complete-config/instructions_en.md | 11 ++ .../llm/complete-config/rubrics/rubrics.toml | 9 ++ .../complete-config/solution/explanation.md | 16 +++ .../llm/complete-config/solution/script.py | 3 + .../llm/complete-config/task/explanation.md | 1 + .../llm/complete-config/task/script.py | 7 + .../llm/invalid-model-family/config.toml | 48 +++++++ .../grading/examples.toml | 7 + .../llm/invalid-model-family/grading/post.md | 1 + .../llm/invalid-model-family/grading/tests.py | 37 +++++ .../invalid-model-family/instructions_en.md | 11 ++ .../invalid-model-family/rubrics/rubrics.toml | 9 ++ .../solution/explanation.md | 16 +++ .../invalid-model-family/solution/script.py | 3 + .../invalid-model-family/task/explanation.md | 1 + 
.../llm/invalid-model-family/task/script.py | 7 + .../llm/invalid-permissions/config.toml | 49 +++++++ .../invalid-permissions/grading/examples.toml | 7 + .../llm/invalid-permissions/grading/post.md | 1 + .../llm/invalid-permissions/grading/tests.py | 37 +++++ .../invalid-permissions/instructions_en.md | 11 ++ .../invalid-permissions/rubrics/rubrics.toml | 9 ++ .../solution/explanation.md | 16 +++ .../invalid-permissions/solution/script.py | 3 + .../invalid-permissions/task/explanation.md | 1 + .../llm/invalid-permissions/task/script.py | 7 + .../resources/llm/minimal-config/config.toml | 36 +++++ .../llm/minimal-config/instructions_en.md | 11 ++ .../minimal-config/solution/explanation.md | 16 +++ .../llm/minimal-config/solution/script.py | 3 + .../llm/minimal-config/task/explanation.md | 1 + .../llm/minimal-config/task/script.py | 7 + .../resources/llm/missing-files/config.toml | 37 +++++ .../llm/missing-files/instructions_en.md | 11 ++ .../llm/missing-files/task/explanation.md | 1 + .../llm/missing-files/task/script.py | 7 + src/tests/test_llm_execution.py | 83 ++++++++++++ 44 files changed, 704 insertions(+), 67 deletions(-) create mode 100644 src/tests/resources/llm/complete-config/config.toml create mode 100644 src/tests/resources/llm/complete-config/grading/examples.toml create mode 100644 src/tests/resources/llm/complete-config/grading/post.md create mode 100644 src/tests/resources/llm/complete-config/grading/tests.py create mode 100644 src/tests/resources/llm/complete-config/instructions_en.md create mode 100644 src/tests/resources/llm/complete-config/rubrics/rubrics.toml create mode 100644 src/tests/resources/llm/complete-config/solution/explanation.md create mode 100644 src/tests/resources/llm/complete-config/solution/script.py create mode 100644 src/tests/resources/llm/complete-config/task/explanation.md create mode 100644 src/tests/resources/llm/complete-config/task/script.py create mode 100644 src/tests/resources/llm/invalid-model-family/config.toml create mode 100644 src/tests/resources/llm/invalid-model-family/grading/examples.toml create mode 100644 src/tests/resources/llm/invalid-model-family/grading/post.md create mode 100644 src/tests/resources/llm/invalid-model-family/grading/tests.py create mode 100644 src/tests/resources/llm/invalid-model-family/instructions_en.md create mode 100644 src/tests/resources/llm/invalid-model-family/rubrics/rubrics.toml create mode 100644 src/tests/resources/llm/invalid-model-family/solution/explanation.md create mode 100644 src/tests/resources/llm/invalid-model-family/solution/script.py create mode 100644 src/tests/resources/llm/invalid-model-family/task/explanation.md create mode 100644 src/tests/resources/llm/invalid-model-family/task/script.py create mode 100644 src/tests/resources/llm/invalid-permissions/config.toml create mode 100644 src/tests/resources/llm/invalid-permissions/grading/examples.toml create mode 100644 src/tests/resources/llm/invalid-permissions/grading/post.md create mode 100644 src/tests/resources/llm/invalid-permissions/grading/tests.py create mode 100644 src/tests/resources/llm/invalid-permissions/instructions_en.md create mode 100644 src/tests/resources/llm/invalid-permissions/rubrics/rubrics.toml create mode 100644 src/tests/resources/llm/invalid-permissions/solution/explanation.md create mode 100644 src/tests/resources/llm/invalid-permissions/solution/script.py create mode 100644 src/tests/resources/llm/invalid-permissions/task/explanation.md create mode 100644 
src/tests/resources/llm/invalid-permissions/task/script.py create mode 100644 src/tests/resources/llm/minimal-config/config.toml create mode 100644 src/tests/resources/llm/minimal-config/instructions_en.md create mode 100644 src/tests/resources/llm/minimal-config/solution/explanation.md create mode 100644 src/tests/resources/llm/minimal-config/solution/script.py create mode 100644 src/tests/resources/llm/minimal-config/task/explanation.md create mode 100644 src/tests/resources/llm/minimal-config/task/script.py create mode 100644 src/tests/resources/llm/missing-files/config.toml create mode 100644 src/tests/resources/llm/missing-files/instructions_en.md create mode 100644 src/tests/resources/llm/missing-files/task/explanation.md create mode 100644 src/tests/resources/llm/missing-files/task/script.py create mode 100644 src/tests/test_llm_execution.py diff --git a/src/access_cli_sealuzh/__init__.py b/src/access_cli_sealuzh/__init__.py index efdd8f6..bf06031 100644 --- a/src/access_cli_sealuzh/__init__.py +++ b/src/access_cli_sealuzh/__init__.py @@ -49,16 +49,16 @@ def main(): help = "recurse into nested structures (assignments/tasks) if applicable") parser.add_argument('-A', '--auto-detect', action='store_true', default=False, help = "attempt to auto-detect what is being validated") - parser.add_argument('--llm-api-key', type=str, + parser.add_argument('-k', '--llm-api-key', type=str, help = "API key for the LLM service. Create one at the corresponding service provider.") - parser.add_argument('--llm-keep-service', action=argparse.BooleanOptionalAction, + parser.add_argument('-K', '--llm-keep-service', action=argparse.BooleanOptionalAction, help = "Keep LLM service running after validation for further grading") - parser.add_argument('--llm-only', action=argparse.BooleanOptionalAction, + parser.add_argument('-L', '--llm-only', action=argparse.BooleanOptionalAction, help = "Only run LLM grading (for TAs designing tasks)") - parser.add_argument('--assistant-url', + parser.add_argument('-U', '--assistant-url', default="http://localhost:4000", help = "URL of the assistant service") - parser.add_argument('--llm-model', type=str, + parser.add_argument('-M', '--llm-model', type=str, help = "Model to use for LLM grading") args = parser.parse_args() diff --git a/src/access_cli_sealuzh/main.py b/src/access_cli_sealuzh/main.py index 1ad9433..4bb1c80 100755 --- a/src/access_cli_sealuzh/main.py +++ b/src/access_cli_sealuzh/main.py @@ -171,10 +171,52 @@ def validate_task(self, course_dir=None, assignment_dir=None, task_dir=None): task = os.path.join(assignment_dir, task_dir) else: task = os.path.join(course_dir, assignment_dir, task_dir) + self.print(f" > Validating task {task}", True) + self.logger.set_subject(task) try: path, config = self.read_directory_config(task) except FileNotFoundError: return + + # Validate LLM configuration if present + if "llm" in config: + # Check required files exist + if "submission" in config["llm"]: + file_path = config["llm"]["submission"] + if not os.path.isfile(os.path.join(task, file_path)): + self.logger.error(f"{path} llm references non-existing submission file: {file_path}") + + # Check optional files exist if specified + for file_key in ["post", "pre", "rubrics", "examples", "solution", "prompt"]: + if file_key in config["llm"]: + file_path = config["llm"][file_key] + if not os.path.isfile(os.path.join(task, file_path)): + self.logger.error(f"{path} llm references non-existing {file_key} file: {file_path}") + + # Validate that referenced files are in the correct 
context + for file_path in [config["llm"].get(k) for k in ["rubrics", "examples", "solution", "post", "pre", "prompt"] if k in config["llm"]]: + if file_path in config["files"]["editable"]: + self.logger.error(f"{path} llm file {file_path} marked as editable") + if file_path in config["files"]["visible"]: + self.logger.error(f"{path} llm file {file_path} marked as visible") + + # Run AI validation if --llm-only is set + if self.args.llm_only: + if "llm" in config: + # Check if we should override model from CLI + model_override = getattr(self.args, 'llm_model', None) # Safely get llm_model + if model_override: # Override model from CLI + config["llm"]["model"] = model_override + + if self.args.grade_template: + self.execute_ai_grading(task, config, 0) + if self.args.grade_solution: + self.execute_ai_grading(task, config, config["llm"]["max_points"], self.args.grade_solution) + else: + self.print(f"{path} llm not specified in config") + + return + # schema validation if not self.v.validate(config, task_schema): self.logger.error(f"{path} schema errors:\n\t{self.pp.pformat(self.v.errors)}") @@ -225,39 +267,13 @@ def validate_task(self, course_dir=None, assignment_dir=None, task_dir=None): if self.args.test_solution: self.execute_command(task, config, "test_command", 0, solve_command=self.args.solve_command) - # Skip non-AI validation if --llm-only is set - if not self.args.llm_only: - if self.args.grade_template: - self.execute_grade_command(task, config, 0) - if self.args.grade_solution: - self.execute_grade_command(task, config, config["max_points"], self.args.solve_command) - - - # Validate LLM configuration if present - if "llm" in config: - # Check required files exist - for file_key in ["submission", "rubrics", "examples", "solution"]: - if file_key in config["llm"]: - file_path = config["llm"][file_key] - if not os.path.isfile(os.path.join(task, file_path)): - self.logger.error(f"{path} llm references non-existing {file_key} file: {file_path}") - - # Check optional files exist if specified - for file_key in ["post", "pre"]: - if file_key in config["llm"]: - file_path = config["llm"][file_key] - if not os.path.isfile(os.path.join(task, file_path)): - self.logger.error(f"{path} llm references non-existing {file_key} file: {file_path}") - - # Validate that referenced files are in the correct context - for file_path in [config["llm"].get(k) for k in ["rubrics", "examples", "solution", "post", "pre"] if k in config["llm"]]: - if file_path in config["files"]["editable"]: - self.logger.error(f"{path} llm file {file_path} marked as editable") - if file_path in config["files"]["visible"]: - self.logger.error(f"{path} llm file {file_path} marked as visible") + if self.args.grade_template: + self.execute_grade_command(task, config, 0) + if self.args.grade_solution: + self.execute_grade_command(task, config, config["max_points"], self.args.solve_command) # Run AI validation if --llm-only is set - if self.args.llm_only and "llm" in config: + if "llm" in config: # Check if we should override model from CLI model_override = getattr(self.args, 'llm_model', None) # Safely get llm_model if model_override: # Override model from CLI @@ -268,17 +284,6 @@ def validate_task(self, course_dir=None, assignment_dir=None, task_dir=None): if self.args.grade_solution: self.execute_ai_grading(task, config, config["llm"]["max_points"], test_solution=True) - # TODO GBAI: - - # TODO GBAI: - # * def execute_ai_grading(...? 
- # * first check if the AI service is running, if not, pull and start - # * self.logger.error if something goes wrong - # * if there are no errors, that means the validation passes - # * validate that the template receives 0 points - # * validate that the sample solution receives expected llm.max_points - # * use self.print to show AI results (points and feedback) - def execute_grade_command(self, task, config, expected_points, solve_command=None): grade_results = self.execute_command(task, config, "grade_command", solve_command=solve_command) @@ -529,12 +534,10 @@ def execute_ai_grading(self, task, config, expected_points=None, test_solution=N self.logger.error(f"Could not read {'solution' if test_solution else 'submission'} file") return - # Read and parse rubrics and examples - rubrics_content = self.read_rubrics_from_toml(task, llm_config["rubrics"]) - examples_content = self.read_examples_from_toml(task, llm_config["examples"]) - solution_content = self.read_text_file(task, llm_config["solution"]) - # Read optional files + rubrics_content = self.read_rubrics_from_toml(task, llm_config.get("rubrics")) if "rubrics" in llm_config else [] + examples_content = self.read_examples_from_toml(task, llm_config.get("examples")) if "examples" in llm_config else [] + solution_content = self.read_text_file(task, llm_config.get("solution")) if "solution" in llm_config else None pre_content = self.read_text_file(task, llm_config.get("pre")) if "pre" in llm_config else None post_content = self.read_text_file(task, llm_config.get("post")) if "post" in llm_config else None prompt_content = self.read_text_file(task, llm_config.get("prompt")) if "prompt" in llm_config else None @@ -546,9 +549,10 @@ def execute_ai_grading(task, config, expected_points=None, test_solution=N model_family = llm_config.get("model_family", "claude") default_model = "claude-3-5-sonnet-latest" if model_family == "claude" else "gpt-4o-mini" model = llm_config.get("model", default_model) - + max_points = llm_config.get("max_points", config.get("max_points", 1)) + assistant_request = { - "question": instruction_content or "No instructions provided", + "question": instruction_content, "answer": submission_content, "llmType": model_family, "chainOfThought": llm_config.get("cot", False), "votingCount": llm_config.get("voting", 1), "rubrics": rubrics_content if rubrics_content else [], "prompt": prompt_content, "prePrompt": pre_content, "postPrompt": post_content, - "temperature": llm_config.get("temperature"), + "temperature": llm_config.get("temperature", 0.2), "fewShotExamples": examples_content if examples_content else [], - "maxPoints": llm_config["max_points"], + "maxPoints": max_points, "modelSolution": solution_content, "llmModel": model, "apiKey": self.args.llm_api_key @@ -583,14 +587,15 @@ def execute_ai_grading(self, task, config, expected_points=None, test_solution=N ) status_response.raise_for_status() status_data = status_response.json() - print("Polling: Current status of AI grading: ", status_data["status"]) + self.print(f"Polling: Current status of LLM processing: {status_data['status']}") + # Task completed if status_data["status"] == "completed": result = status_data["result"] # Print results self.print("╭──AI Grading Results──╮") - self.print(f"│ Points: {result['points']}/{config['llm']['max_points']}") + self.print(f"│ Points: {result['points']}/{max_points}") self.print("│ Feedback:") for line in result['feedback'].split('\n'): self.print(f"│ {line}") @@ -604,21 +609,18 @@ def execute_ai_grading(self,
task, config, expected_points=None, test_solution=N self.logger.error( f"AI grading: got {result['points']} points but expected {expected_points}" ) - - return result - + return + # Task not found elif status_data["status"] == "not_found": self.logger.error(f"AI grading task not found: {task_id}") - return None - + return + time.sleep(delay_seconds) - self.logger.error("AI grading timed out") - return None + self.logger.error("LLM processing timed out") except requests.RequestException as e: - self.logger.error(f"AI grading failed: {str(e)}") - return None + self.logger.error(f"LLM processing failed: {str(e)}") finally: self.stop_llm_service() diff --git a/src/access_cli_sealuzh/schema.py b/src/access_cli_sealuzh/schema.py index fb930ac..37030ec 100644 --- a/src/access_cli_sealuzh/schema.py +++ b/src/access_cli_sealuzh/schema.py @@ -76,6 +76,7 @@ # - if each file in files actually exists # - that none of the grading or solution files are editable or visible # - that editable files are also visible +# - that if you use llm evaluation, all defined files in the config are also present as files (submission, rubrics, examples, solution) # - OPTIONALLY: that the run, test and grade commands execute correctly task_schema = { "slug": {'required': True, 'type': 'string'}, diff --git a/src/tests/resources/llm/complete-config/config.toml b/src/tests/resources/llm/complete-config/config.toml new file mode 100644 index 0000000..110670e --- /dev/null +++ b/src/tests/resources/llm/complete-config/config.toml @@ -0,0 +1,48 @@ +slug = "string_manipulation" + + +max_attempts = 6 +refill = 43200 # 12 hours +max_points = 2 + +[information.en] +title = "String Manipulation in Python" +instructions_file = "instructions_en.md" + +[evaluator] +docker_image = "python:latest" +run_command = "python -m task.script" +test_command = "python -m unittest discover -v task" +grade_command = "python -m grading.tests" + +[llm] +submission = "task/explanation.md" +rubrics = 'rubrics/rubrics.toml' # This file contains the rubrics for the task to guide the model +examples = 'grading/examples.toml' # This file contains examples of the task for the model to learn from +solution = 'solution/explanation.md' # This is the solution file used to guide the model +cot = true # This will add "think step by step" to the context prompt in order to encourage the model to think step by step +voting = 3 # This allows for the results to be evaluated 3 times, and the most common result is chosen. This setting will increase the time it takes to grade the task! +post = "grading/post.md" # Adds further instruction at the end of the context prompt +# pre = "grading/pre.md" # Adds further instruction in front of the context prompt +# prompt = "grading/prompt.md" # replaces the context prompt with the content of the file. This is only for advanced usages! +temperature = 0.2 # Decides the randomness of the gpt model +model_family = "claude" # gpt or claude +model = "claude-3-5-sonnet-latest" # gpt-4o, gpt-4o-mini, claude-3-5-sonnet-latest, etc. 
+max_points = 1 # Max points for the sub-task that is passed to the model + +[files] +visible = [ + "task/script.py", + "task/explanation.md", +] +editable = [ + "task/script.py", + "task/explanation.md", +] +grading = [ + "grading/tests.py", +] +solution = [ + "solution/script.py", + "solution/explanation.md", +] diff --git a/src/tests/resources/llm/complete-config/grading/examples.toml b/src/tests/resources/llm/complete-config/grading/examples.toml new file mode 100644 index 0000000..ba40170 --- /dev/null +++ b/src/tests/resources/llm/complete-config/grading/examples.toml @@ -0,0 +1,7 @@ +[[examples]] +answer = "Reversing word order and reversing characters both have O(n) complexity, but character reversal requires more operations per word, making it slightly less efficient in practice." +points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0.5 } + +[[examples]] +answer = "Both operations have O(n) complexity because they process each character in the string." +points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0 } diff --git a/src/tests/resources/llm/complete-config/grading/post.md b/src/tests/resources/llm/complete-config/grading/post.md new file mode 100644 index 0000000..8a624ec --- /dev/null +++ b/src/tests/resources/llm/complete-config/grading/post.md @@ -0,0 +1 @@ +The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics! \ No newline at end of file diff --git a/src/tests/resources/llm/complete-config/grading/tests.py b/src/tests/resources/llm/complete-config/grading/tests.py new file mode 100644 index 0000000..b54e61e --- /dev/null +++ b/src/tests/resources/llm/complete-config/grading/tests.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# Scaffolding necessary to set up ACCESS test +import sys +try: from universal.harness import * +except: sys.path.append("../../universal/"); from harness import * + +# Grading test suite starts here + +script = grading_import("task", "script") + +class GradingTests(AccessTestCase): + + def _test(self, sentence, expected): + actual = script.reverse_words(sentence) + self.hint(f"Reversal not correct for sentence='{sentence}'... expected result is '{expected}'!") + self.assertEqual(expected, actual) + + def test_case1(self): + self._test("Hello World", "World Hello") + + def test_case2(self): + self._test(" This is a test ", "test a is This") + + def test_case3(self): + self._test("Python", "Python") + + def test_case4(self): + self._test("", "") + + def test_case5(self): + self._test("Hello, World!", "World! Hello,") + + def test_case6(self): + self._test("123 456 789", "789 456 123") + +TestRunner().run(AccessTestSuite(1, [GradingTests])) diff --git a/src/tests/resources/llm/complete-config/instructions_en.md b/src/tests/resources/llm/complete-config/instructions_en.md new file mode 100644 index 0000000..dfbb0fb --- /dev/null +++ b/src/tests/resources/llm/complete-config/instructions_en.md @@ -0,0 +1,11 @@ +# String Manipulation in Python + +## Task Description + +Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example: + +- Input: `"Hello World"` +- Output: `"World Hello"` + +Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided. 
+Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning. diff --git a/src/tests/resources/llm/complete-config/rubrics/rubrics.toml b/src/tests/resources/llm/complete-config/rubrics/rubrics.toml new file mode 100644 index 0000000..ed067b5 --- /dev/null +++ b/src/tests/resources/llm/complete-config/rubrics/rubrics.toml @@ -0,0 +1,9 @@ +[[rubrics]] +id = "time_complexity_mentioned" +title = "Mentioned the time complexity of both operations is O(n)" +points = 0.5 + +[[rubrics]] +id = "asymptotically_equivalent" +title = "Explained that both are asymptotically equivalent but in practice, reversing characters is slower" +points = 0.5 diff --git a/src/tests/resources/llm/complete-config/solution/explanation.md b/src/tests/resources/llm/complete-config/solution/explanation.md new file mode 100644 index 0000000..61da112 --- /dev/null +++ b/src/tests/resources/llm/complete-config/solution/explanation.md @@ -0,0 +1,16 @@ +Reversing the order of words in a sentence and reversing the characters within each word have different complexities. + + Time Complexity of Both Operations: + Reversing the word order involves splitting the string into a list of words O(n), reversing the list(O(n), and joining it back into a string O(n). This results in an overall complexity of O(n). + Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n). + + Which is More Efficient and Why: + Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation. + If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation. + +Example: + + "Hello World" → "World Hello" (word order reversal) + "Hello World" → "olleH dlroW" (character reversal) + +While both methods scale similarly, reversing characters involves additional operations per word, making it slightly more complex in practical execution. \ No newline at end of file diff --git a/src/tests/resources/llm/complete-config/solution/script.py b/src/tests/resources/llm/complete-config/solution/script.py new file mode 100644 index 0000000..f4e4528 --- /dev/null +++ b/src/tests/resources/llm/complete-config/solution/script.py @@ -0,0 +1,3 @@ +def reverse_words(sentence): + # Split the sentence into words, reverse the list, and join it back into a string + return ' '.join(sentence.split()[::-1]) diff --git a/src/tests/resources/llm/complete-config/task/explanation.md b/src/tests/resources/llm/complete-config/task/explanation.md new file mode 100644 index 0000000..66f3e2d --- /dev/null +++ b/src/tests/resources/llm/complete-config/task/explanation.md @@ -0,0 +1 @@ +[comment]: <> (Add your solution here:) diff --git a/src/tests/resources/llm/complete-config/task/script.py b/src/tests/resources/llm/complete-config/task/script.py new file mode 100644 index 0000000..7b3c71b --- /dev/null +++ b/src/tests/resources/llm/complete-config/task/script.py @@ -0,0 +1,7 @@ +# Task: Implement a function `reverse_words` that takes a string +# and returns the string with the order of words reversed. 
+# Example: "Hello World" -> "World Hello" + +def reverse_words(sentence): + # TODO: Implement this function + pass diff --git a/src/tests/resources/llm/invalid-model-family/config.toml b/src/tests/resources/llm/invalid-model-family/config.toml new file mode 100644 index 0000000..0ad0948 --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/config.toml @@ -0,0 +1,48 @@ +slug = "string_manipulation" + + +max_attempts = 6 +refill = 43200 # 12 hours +max_points = 2 + +[information.en] +title = "String Manipulation in Python" +instructions_file = "instructions_en.md" + +[evaluator] +docker_image = "python:latest" +run_command = "python -m task.script" +test_command = "python -m unittest discover -v task" +grade_command = "python -m grading.tests" + +[llm] +submission = "task/explanation.md" +rubrics = 'rubrics/rubrics.toml' # This file contains the rubrics for the task to guide the model +examples = 'grading/examples.toml' # This file contains examples of the task for the model to learn from +solution = 'solution/explanation.md' # This is the solution file used to guide the model +cot = true # This will add "think step by step" to the context prompt in order to encourage the model to think step by step +voting = 3 # This allows for the results to be evaluated 3 times, and the most common result is chosen. This setting will increase the time it takes to grade the task! +post = "grading/post.md" # Adds further instruction at the end of the context prompt +# pre = "grading/pre.md" # Adds further instruction in front of the context prompt +# prompt = "grading/prompt.md" # replaces the context prompt with the content of the file. This is only for advanced usages! +temperature = 0.2 # Decides the randomness of the gpt model +model_family = "invalid" # This is invalid and should trigger an error +model = "claude-3-5-sonnet-latest" # gpt-4o, gpt-4o-mini, claude-3-5-sonnet-latest, etc. +max_points = 1 # Max points for the sub-task that is passed to the model + +[files] +visible = [ + "task/script.py", + "task/explanation.md", +] +editable = [ + "task/script.py", + "task/explanation.md", +] +grading = [ + "grading/tests.py", +] +solution = [ + "solution/script.py", + "solution/explanation.md", +] diff --git a/src/tests/resources/llm/invalid-model-family/grading/examples.toml b/src/tests/resources/llm/invalid-model-family/grading/examples.toml new file mode 100644 index 0000000..ba40170 --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/grading/examples.toml @@ -0,0 +1,7 @@ +[[examples]] +answer = "Reversing word order and reversing characters both have O(n) complexity, but character reversal requires more operations per word, making it slightly less efficient in practice." +points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0.5 } + +[[examples]] +answer = "Both operations have O(n) complexity because they process each character in the string." +points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0 } diff --git a/src/tests/resources/llm/invalid-model-family/grading/post.md b/src/tests/resources/llm/invalid-model-family/grading/post.md new file mode 100644 index 0000000..8a624ec --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/grading/post.md @@ -0,0 +1 @@ +The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics! 
\ No newline at end of file diff --git a/src/tests/resources/llm/invalid-model-family/grading/tests.py b/src/tests/resources/llm/invalid-model-family/grading/tests.py new file mode 100644 index 0000000..b54e61e --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/grading/tests.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# Scaffolding necessary to set up ACCESS test +import sys +try: from universal.harness import * +except: sys.path.append("../../universal/"); from harness import * + +# Grading test suite starts here + +script = grading_import("task", "script") + +class GradingTests(AccessTestCase): + + def _test(self, sentence, expected): + actual = script.reverse_words(sentence) + self.hint(f"Reversal not correct for sentence='{sentence}'... expected result is '{expected}'!") + self.assertEqual(expected, actual) + + def test_case1(self): + self._test("Hello World", "World Hello") + + def test_case2(self): + self._test(" This is a test ", "test a is This") + + def test_case3(self): + self._test("Python", "Python") + + def test_case4(self): + self._test("", "") + + def test_case5(self): + self._test("Hello, World!", "World! Hello,") + + def test_case6(self): + self._test("123 456 789", "789 456 123") + +TestRunner().run(AccessTestSuite(1, [GradingTests])) diff --git a/src/tests/resources/llm/invalid-model-family/instructions_en.md b/src/tests/resources/llm/invalid-model-family/instructions_en.md new file mode 100644 index 0000000..dfbb0fb --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/instructions_en.md @@ -0,0 +1,11 @@ +# String Manipulation in Python + +## Task Description + +Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example: + +- Input: `"Hello World"` +- Output: `"World Hello"` + +Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided. +Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning. diff --git a/src/tests/resources/llm/invalid-model-family/rubrics/rubrics.toml b/src/tests/resources/llm/invalid-model-family/rubrics/rubrics.toml new file mode 100644 index 0000000..ed067b5 --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/rubrics/rubrics.toml @@ -0,0 +1,9 @@ +[[rubrics]] +id = "time_complexity_mentioned" +title = "Mentioned the time complexity of both operations is O(n)" +points = 0.5 + +[[rubrics]] +id = "asymptotically_equivalent" +title = "Explained that both are asymptotically equivalent but in practice, reversing characters is slower" +points = 0.5 diff --git a/src/tests/resources/llm/invalid-model-family/solution/explanation.md b/src/tests/resources/llm/invalid-model-family/solution/explanation.md new file mode 100644 index 0000000..61da112 --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/solution/explanation.md @@ -0,0 +1,16 @@ +Reversing the order of words in a sentence and reversing the characters within each word have different complexities. + + Time Complexity of Both Operations: + Reversing the word order involves splitting the string into a list of words O(n), reversing the list(O(n), and joining it back into a string O(n). This results in an overall complexity of O(n). 
+ Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n). + + Which is More Efficient and Why: + Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation. + If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation. + +Example: + + "Hello World" → "World Hello" (word order reversal) + "Hello World" → "olleH dlroW" (character reversal) + +While both methods scale similarly, reversing characters involves additional operations per word, making it slightly more complex in practical execution. \ No newline at end of file diff --git a/src/tests/resources/llm/invalid-model-family/solution/script.py b/src/tests/resources/llm/invalid-model-family/solution/script.py new file mode 100644 index 0000000..f4e4528 --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/solution/script.py @@ -0,0 +1,3 @@ +def reverse_words(sentence): + # Split the sentence into words, reverse the list, and join it back into a string + return ' '.join(sentence.split()[::-1]) diff --git a/src/tests/resources/llm/invalid-model-family/task/explanation.md b/src/tests/resources/llm/invalid-model-family/task/explanation.md new file mode 100644 index 0000000..66f3e2d --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/task/explanation.md @@ -0,0 +1 @@ +[comment]: <> (Add your solution here:) diff --git a/src/tests/resources/llm/invalid-model-family/task/script.py b/src/tests/resources/llm/invalid-model-family/task/script.py new file mode 100644 index 0000000..7b3c71b --- /dev/null +++ b/src/tests/resources/llm/invalid-model-family/task/script.py @@ -0,0 +1,7 @@ +# Task: Implement a function `reverse_words` that takes a string +# and returns the string with the order of words reversed. +# Example: "Hello World" -> "World Hello" + +def reverse_words(sentence): + # TODO: Implement this function + pass diff --git a/src/tests/resources/llm/invalid-permissions/config.toml b/src/tests/resources/llm/invalid-permissions/config.toml new file mode 100644 index 0000000..58401dd --- /dev/null +++ b/src/tests/resources/llm/invalid-permissions/config.toml @@ -0,0 +1,49 @@ +slug = "string_manipulation" + + +max_attempts = 6 +refill = 43200 # 12 hours +max_points = 2 + +[information.en] +title = "String Manipulation in Python" +instructions_file = "instructions_en.md" + +[evaluator] +docker_image = "python:latest" +run_command = "python -m task.script" +test_command = "python -m unittest discover -v task" +grade_command = "python -m grading.tests" + +[llm] +submission = "task/explanation.md" +rubrics = "rubrics/rubrics.toml" # This file contains the rubrics for the task to guide the model +examples = "grading/examples.toml" # This file contains examples of the task for the model to learn from +solution = "solution/explanation.md" # This is the solution file used to guide the model +cot = true # This will add "think step by step" to the context prompt in order to encourage the model to think step by step +voting = 3 # This allows for the results to be evaluated 3 times, and the most common result is chosen. This setting will increase the time it takes to grade the task! 
+post = "grading/post.md" # Adds further instruction at the end of the context prompt +# pre = "grading/pre.md" # Adds further instruction in front of the context prompt +# prompt = "grading/prompt.md" # replaces the context prompt with the content of the file. This is only for advanced usages! +temperature = 0.2 # Decides the randomness of the gpt model +model_family = "claude" # gpt or claude +model = "claude-3-5-sonnet-latest" # gpt-4o, gpt-4o-mini, claude-3-5-sonnet-latest, etc. +max_points = 1 # Max points for the sub-task that is passed to the model + +[files] +visible = [ + "task/script.py", + "task/explanation.md", + "rubrics/rubrics.toml" +] +editable = [ + "task/script.py", + "task/explanation.md", +] +grading = [ + "grading/tests.py", +] +solution = [ + "solution/script.py", + "solution/explanation.md", +] diff --git a/src/tests/resources/llm/invalid-permissions/grading/examples.toml b/src/tests/resources/llm/invalid-permissions/grading/examples.toml new file mode 100644 index 0000000..ba40170 --- /dev/null +++ b/src/tests/resources/llm/invalid-permissions/grading/examples.toml @@ -0,0 +1,7 @@ +[[examples]] +answer = "Reversing word order and reversing characters both have O(n) complexity, but character reversal requires more operations per word, making it slightly less efficient in practice." +points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0.5 } + +[[examples]] +answer = "Both operations have O(n) complexity because they process each character in the string." +points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0 } diff --git a/src/tests/resources/llm/invalid-permissions/grading/post.md b/src/tests/resources/llm/invalid-permissions/grading/post.md new file mode 100644 index 0000000..8a624ec --- /dev/null +++ b/src/tests/resources/llm/invalid-permissions/grading/post.md @@ -0,0 +1 @@ +The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics! \ No newline at end of file diff --git a/src/tests/resources/llm/invalid-permissions/grading/tests.py b/src/tests/resources/llm/invalid-permissions/grading/tests.py new file mode 100644 index 0000000..b54e61e --- /dev/null +++ b/src/tests/resources/llm/invalid-permissions/grading/tests.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# Scaffolding necessary to set up ACCESS test +import sys +try: from universal.harness import * +except: sys.path.append("../../universal/"); from harness import * + +# Grading test suite starts here + +script = grading_import("task", "script") + +class GradingTests(AccessTestCase): + + def _test(self, sentence, expected): + actual = script.reverse_words(sentence) + self.hint(f"Reversal not correct for sentence='{sentence}'... expected result is '{expected}'!") + self.assertEqual(expected, actual) + + def test_case1(self): + self._test("Hello World", "World Hello") + + def test_case2(self): + self._test(" This is a test ", "test a is This") + + def test_case3(self): + self._test("Python", "Python") + + def test_case4(self): + self._test("", "") + + def test_case5(self): + self._test("Hello, World!", "World! 
Hello,") + + def test_case6(self): + self._test("123 456 789", "789 456 123") + +TestRunner().run(AccessTestSuite(1, [GradingTests])) diff --git a/src/tests/resources/llm/invalid-permissions/instructions_en.md b/src/tests/resources/llm/invalid-permissions/instructions_en.md new file mode 100644 index 0000000..dfbb0fb --- /dev/null +++ b/src/tests/resources/llm/invalid-permissions/instructions_en.md @@ -0,0 +1,11 @@ +# String Manipulation in Python + +## Task Description + +Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example: + +- Input: `"Hello World"` +- Output: `"World Hello"` + +Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided. +Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning. diff --git a/src/tests/resources/llm/invalid-permissions/rubrics/rubrics.toml b/src/tests/resources/llm/invalid-permissions/rubrics/rubrics.toml new file mode 100644 index 0000000..ed067b5 --- /dev/null +++ b/src/tests/resources/llm/invalid-permissions/rubrics/rubrics.toml @@ -0,0 +1,9 @@ +[[rubrics]] +id = "time_complexity_mentioned" +title = "Mentioned the time complexity of both operations is O(n)" +points = 0.5 + +[[rubrics]] +id = "asymptotically_equivalent" +title = "Explained that both are asymptotically equivalent but in practice, reversing characters is slower" +points = 0.5 diff --git a/src/tests/resources/llm/invalid-permissions/solution/explanation.md b/src/tests/resources/llm/invalid-permissions/solution/explanation.md new file mode 100644 index 0000000..61da112 --- /dev/null +++ b/src/tests/resources/llm/invalid-permissions/solution/explanation.md @@ -0,0 +1,16 @@ +Reversing the order of words in a sentence and reversing the characters within each word have different complexities. + + Time Complexity of Both Operations: + Reversing the word order involves splitting the string into a list of words O(n), reversing the list(O(n), and joining it back into a string O(n). This results in an overall complexity of O(n). + Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n). + + Which is More Efficient and Why: + Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation. + If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation. + +Example: + + "Hello World" → "World Hello" (word order reversal) + "Hello World" → "olleH dlroW" (character reversal) + +While both methods scale similarly, reversing characters involves additional operations per word, making it slightly more complex in practical execution. 
\ No newline at end of file
diff --git a/src/tests/resources/llm/invalid-permissions/solution/script.py b/src/tests/resources/llm/invalid-permissions/solution/script.py
new file mode 100644
index 0000000..f4e4528
--- /dev/null
+++ b/src/tests/resources/llm/invalid-permissions/solution/script.py
@@ -0,0 +1,3 @@
+def reverse_words(sentence):
+    # Split the sentence into words, reverse the list, and join it back into a string
+    return ' '.join(sentence.split()[::-1])
diff --git a/src/tests/resources/llm/invalid-permissions/task/explanation.md b/src/tests/resources/llm/invalid-permissions/task/explanation.md
new file mode 100644
index 0000000..66f3e2d
--- /dev/null
+++ b/src/tests/resources/llm/invalid-permissions/task/explanation.md
@@ -0,0 +1 @@
+[comment]: <> (Add your solution here:)
diff --git a/src/tests/resources/llm/invalid-permissions/task/script.py b/src/tests/resources/llm/invalid-permissions/task/script.py
new file mode 100644
index 0000000..7b3c71b
--- /dev/null
+++ b/src/tests/resources/llm/invalid-permissions/task/script.py
@@ -0,0 +1,7 @@
+# Task: Implement a function `reverse_words` that takes a string
+# and returns the string with the order of words reversed.
+# Example: "Hello World" -> "World Hello"
+
+def reverse_words(sentence):
+    # TODO: Implement this function
+    pass
diff --git a/src/tests/resources/llm/minimal-config/config.toml b/src/tests/resources/llm/minimal-config/config.toml
new file mode 100644
index 0000000..39aecf6
--- /dev/null
+++ b/src/tests/resources/llm/minimal-config/config.toml
@@ -0,0 +1,36 @@
+slug = "string_manipulation"
+
+
+max_attempts = 6
+refill = 43200 # 12 hours
+max_points = 2
+
+[information.en]
+title = "String Manipulation in Python"
+instructions_file = "instructions_en.md"
+
+[evaluator]
+docker_image = "python:latest"
+run_command = "python -m task.script"
+test_command = "python -m unittest discover -v task"
+grade_command = "python -m grading.tests"
+
+[llm]
+submission = "task/explanation.md"
+
+[files]
+visible = [
+    "task/script.py",
+    "task/explanation.md",
+]
+editable = [
+    "task/script.py",
+    "task/explanation.md",
+]
+grading = [
+    "grading/tests.py",
+]
+solution = [
+    "solution/script.py",
+    "solution/explanation.md",
+]
diff --git a/src/tests/resources/llm/minimal-config/instructions_en.md b/src/tests/resources/llm/minimal-config/instructions_en.md
new file mode 100644
index 0000000..dfbb0fb
--- /dev/null
+++ b/src/tests/resources/llm/minimal-config/instructions_en.md
@@ -0,0 +1,11 @@
+# String Manipulation in Python
+
+## Task Description
+
+Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example:
+
+- Input: `"Hello World"`
+- Output: `"World Hello"`
+
+Additionally, in the provided `explanation.md` file, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word.
+Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning.
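[Reviewer note: a minimal sketch contrasting the two operations these instructions ask students to compare. `reverse_words` mirrors the sample solution in these fixtures; `reverse_characters` is a hypothetical helper added here purely for illustration and is not part of this patch.]

```python
def reverse_words(sentence):
    # "Hello World" -> "World Hello": split, reverse the word list, re-join (O(n))
    return ' '.join(sentence.split()[::-1])

def reverse_characters(sentence):
    # "Hello World" -> "olleH dlroW": reverse each word in place
    # (also O(n) overall, but with extra per-word slicing work)
    return ' '.join(word[::-1] for word in sentence.split())

assert reverse_words("Hello World") == "World Hello"
assert reverse_characters("Hello World") == "olleH dlroW"
```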
diff --git a/src/tests/resources/llm/minimal-config/solution/explanation.md b/src/tests/resources/llm/minimal-config/solution/explanation.md
new file mode 100644
index 0000000..61da112
--- /dev/null
+++ b/src/tests/resources/llm/minimal-config/solution/explanation.md
@@ -0,0 +1,16 @@
+Reversing the order of words in a sentence and reversing the characters within each word have the same asymptotic complexity but differ in practical cost.
+
+    Time Complexity of Both Operations:
+    Reversing the word order involves splitting the string into a list of words (O(n)), reversing the list (O(n)), and joining it back into a string (O(n)). This results in an overall complexity of O(n).
+    Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n).
+
+    Which is More Efficient and Why:
+    Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation.
+    If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation.
+
+Example:
+
+    "Hello World" → "World Hello" (word order reversal)
+    "Hello World" → "olleH dlroW" (character reversal)
+
+While both methods scale similarly, reversing characters involves additional operations per word, making it slightly slower in practice.
\ No newline at end of file
diff --git a/src/tests/resources/llm/minimal-config/solution/script.py b/src/tests/resources/llm/minimal-config/solution/script.py
new file mode 100644
index 0000000..f4e4528
--- /dev/null
+++ b/src/tests/resources/llm/minimal-config/solution/script.py
@@ -0,0 +1,3 @@
+def reverse_words(sentence):
+    # Split the sentence into words, reverse the list, and join it back into a string
+    return ' '.join(sentence.split()[::-1])
diff --git a/src/tests/resources/llm/minimal-config/task/explanation.md b/src/tests/resources/llm/minimal-config/task/explanation.md
new file mode 100644
index 0000000..66f3e2d
--- /dev/null
+++ b/src/tests/resources/llm/minimal-config/task/explanation.md
@@ -0,0 +1 @@
+[comment]: <> (Add your solution here:)
diff --git a/src/tests/resources/llm/minimal-config/task/script.py b/src/tests/resources/llm/minimal-config/task/script.py
new file mode 100644
index 0000000..7b3c71b
--- /dev/null
+++ b/src/tests/resources/llm/minimal-config/task/script.py
@@ -0,0 +1,7 @@
+# Task: Implement a function `reverse_words` that takes a string
+# and returns the string with the order of words reversed.
+# Example: "Hello World" -> "World Hello" + +def reverse_words(sentence): + # TODO: Implement this function + pass diff --git a/src/tests/resources/llm/missing-files/config.toml b/src/tests/resources/llm/missing-files/config.toml new file mode 100644 index 0000000..b7d3842 --- /dev/null +++ b/src/tests/resources/llm/missing-files/config.toml @@ -0,0 +1,37 @@ +slug = "string_manipulation" + + +max_attempts = 6 +refill = 43200 # 12 hours +max_points = 2 + +[information.en] +title = "String Manipulation in Python" +instructions_file = "instructions_en.md" + +[evaluator] +docker_image = "python:latest" +run_command = "python -m task.script" +test_command = "python -m unittest discover -v task" +grade_command = "python -m grading.tests" + +[llm] +submission = "task/explanation.md" +solution = "solution/explanation.md" + +[files] +visible = [ + "task/script.py", + "task/explanation.md", +] +editable = [ + "task/script.py", + "task/explanation.md", +] +grading = [ + "grading/tests.py", +] +solution = [ + "solution/script.py", + "solution/explanation.md", +] diff --git a/src/tests/resources/llm/missing-files/instructions_en.md b/src/tests/resources/llm/missing-files/instructions_en.md new file mode 100644 index 0000000..dfbb0fb --- /dev/null +++ b/src/tests/resources/llm/missing-files/instructions_en.md @@ -0,0 +1,11 @@ +# String Manipulation in Python + +## Task Description + +Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example: + +- Input: `"Hello World"` +- Output: `"World Hello"` + +Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided. +Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning. diff --git a/src/tests/resources/llm/missing-files/task/explanation.md b/src/tests/resources/llm/missing-files/task/explanation.md new file mode 100644 index 0000000..66f3e2d --- /dev/null +++ b/src/tests/resources/llm/missing-files/task/explanation.md @@ -0,0 +1 @@ +[comment]: <> (Add your solution here:) diff --git a/src/tests/resources/llm/missing-files/task/script.py b/src/tests/resources/llm/missing-files/task/script.py new file mode 100644 index 0000000..7b3c71b --- /dev/null +++ b/src/tests/resources/llm/missing-files/task/script.py @@ -0,0 +1,7 @@ +# Task: Implement a function `reverse_words` that takes a string +# and returns the string with the order of words reversed. 
+# Example: "Hello World" -> "World Hello" + +def reverse_words(sentence): + # TODO: Implement this function + pass diff --git a/src/tests/test_llm_execution.py b/src/tests/test_llm_execution.py new file mode 100644 index 0000000..7442da1 --- /dev/null +++ b/src/tests/test_llm_execution.py @@ -0,0 +1,83 @@ +import unittest +from importlib.resources import files +from access_cli_sealuzh.main import AccessValidator +import os +class LLMExecutionTests(unittest.TestCase): + + def validator(self, directory, commands=None, llm_api_key=None): + """Helper to create validator with LLM settings.""" + if commands is None: + commands = [] + + # Get LLM API key from environment variable (Must pass in as LLM_API_KEY="your_key" for testing in front of the command) + llm_api_key = llm_api_key or os.getenv('LLM_API_KEY', None) + + if llm_api_key is None: + raise ValueError("LLM_API_KEY environment variable is not set. Please set it to your LLM API key.") + + class Args: + def __init__(self): + self.directory = directory + self.level = "task" + self.verbose = True + self.llm_only = True + self.llm_api_key = llm_api_key + self.assistant_url = "http://localhost:4000" + self.llm_keep_service = False + self.grade_template = "template" in commands + self.grade_solution = "solution" in commands + self.test_solution = False + self.solve_command = None + self.global_file = set() + self.course_root = None + self.auto_detect = False + self.user = None + self.llm_model = None + + return AccessValidator(Args()) + + def test_minimal_llm_config(self): + """Test LLM grading with minimal config (only submission).""" + validator = self.validator( + files('tests.resources.llm').joinpath('minimal-config'), + ["template"] + ) + errors = validator.run().error_list() + self.assertEqual(0, len(errors), f"Expected no errors but got:\n{'\n'.join(errors)}. Are you using the correct API key?") + + def test_complete_llm_config(self): + """Test LLM grading with complete config (all optional fields).""" + validator = self.validator( + files('tests.resources.llm').joinpath('complete-config'), + ["solution"] + ) + errors = validator.run().error_list() + self.assertEqual(0, len(errors), f"Expected no errors but got:\n{'\n'.join(errors)}. 
Are you using the correct API key?") + + def test_invalid_model_family(self): + """Test error with invalid model family in config.""" + validator = self.validator( + files('tests.resources.llm').joinpath('invalid-model-family'), + ["template"] + ) + errors = validator.run().error_list() + self.assertEqual(1, len(errors)) + + + def test_missing_required_files(self): + """Test error when required files are missing.""" + validator = self.validator( + files('tests.resources.llm').joinpath('missing-files'), + ["template"] + ) + errors = validator.run().error_list() + self.assertEqual(1, len(errors)) + + def test_invalid_file_permissions(self): + """Test error when LLM files have wrong permissions.""" + validator = self.validator( + files('tests.resources.llm').joinpath('invalid-permissions'), + ["template"] + ) + errors = validator.run().error_list() + self.assertEqual(1, len(errors)) \ No newline at end of file From 78066b5c00a27f2c11e66abdadf3f2fcef26453b Mon Sep 17 00:00:00 2001 From: rnichi1 <71671446+rnichi1@users.noreply.github.com> Date: Sat, 15 Feb 2025 23:55:16 +0100 Subject: [PATCH 3/3] remove todo comment --- src/access_cli_sealuzh/__init__.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/access_cli_sealuzh/__init__.py b/src/access_cli_sealuzh/__init__.py index bf06031..98a195f 100644 --- a/src/access_cli_sealuzh/__init__.py +++ b/src/access_cli_sealuzh/__init__.py @@ -4,14 +4,6 @@ def main(): from access_cli_sealuzh.main import AccessValidator, autodetect - - # TODO GBAI: - # * add option to pass API KEY - # * add additional options the service might need (model selection?) - # * by default, the AI service should stop after validation, add an option - # to keep the service running - # * ensure that there is some combination of options such that ONLY the AI - # grading is executed, since TAs will be relying on it to design the task parser = argparse.ArgumentParser( prog = 'access-cli', description = 'Validate ACCESS course configurations using the CLI')
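
[Reviewer note: for TAs who want to drive the same LLM-only validation from Python rather than through the test suite, a minimal sketch. The attribute names mirror the Args helper in test_llm_execution.py above; the namespace and the "path/to/task" directory are illustrative assumptions, not the CLI's authoritative interface.]

```python
import os
from types import SimpleNamespace

from access_cli_sealuzh.main import AccessValidator

# Illustrative namespace mirroring the test helper's Args class;
# attribute names are taken from this patch's tests.
args = SimpleNamespace(
    directory="path/to/task", level="task", verbose=True,
    llm_only=True, llm_api_key=os.environ["LLM_API_KEY"],
    assistant_url="http://localhost:4000", llm_keep_service=False,
    grade_template=True, grade_solution=False, test_solution=False,
    solve_command=None, global_file=set(), course_root=None,
    auto_detect=False, user=None, llm_model=None,
)

errors = AccessValidator(args).run().error_list()
print("\n".join(errors) or "LLM validation passed")
```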