mp-access · rnichi1 · Feb 15, 2025 · Feb 15, 2025 · Feb 15, 2025
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,7 @@ __pycache__/
 # C extensions
 *.so
 
+
 # Distribution / packaging
 .Python
 build/
@@ -76,5 +77,8 @@ tags
 # Persistent undo
 [._]*.un~
 
+# Local
+.idea/
+
 # End of https://www.toptal.com/developers/gitignore/api/vim,python
 
diff --git a/src/access_cli_sealuzh/__init__.py b/src/access_cli_sealuzh/__init__.py
@@ -4,14 +4,6 @@
 
 def main():
     from access_cli_sealuzh.main import AccessValidator, autodetect
-
-    # TODO GBAI:
-    # * add option to pass API KEY
-    # * add additional options the service might need (model selection?)
-    # * by default, the AI service should stop after validation, add an option
-    #   to keep the service running
-    # * ensure that there is some combination of options such that ONLY the AI
-    #   grading is executed, since TAs will be relying on it to design the task
     parser = argparse.ArgumentParser(
         prog = 'access-cli',
         description = 'Validate ACCESS course configurations using the CLI')
@@ -49,6 +41,17 @@ def main():
         help = "recurse into nested structures (assignments/tasks) if applicable")
     parser.add_argument('-A', '--auto-detect', action='store_true', default=False,
         help = "attempt to auto-detect what is being validated")
+    parser.add_argument('-k', '--llm-api-key', type=str,
+        help = "API key for the LLM service. Create one at the corresponding service provider.")
+    parser.add_argument('-K', '--llm-keep-service', action=argparse.BooleanOptionalAction,
+        help = "Keep LLM service running after validation for further grading")
+    parser.add_argument('-L', '--llm-only', action=argparse.BooleanOptionalAction,
+        help = "Only run LLM grading (for TAs designing tasks)")
+    parser.add_argument('-U', '--assistant-url',
+        default="http://localhost:4000",
+        help = "URL of the assistant service")
+    parser.add_argument('-M', '--llm-model', type=str,
+        help = "Model to use for LLM grading")
     args = parser.parse_args()
 
 

diff --git a/src/access_cli_sealuzh/main.py b/src/access_cli_sealuzh/main.py
diff --git a/src/access_cli_sealuzh/schema.py b/src/access_cli_sealuzh/schema.py
@@ -76,8 +76,9 @@
 # - if each file in files actually exists
 # - that none of the grading or solution files are editable or visible
 # - that editable files are also visible
+# - that if llm evaluation is configured, every file it references (submission, rubrics, examples, solution) actually exists
 # - OPTIONALLY: that the run, test and grade commands execute correctly
-task_schema = { # TODO GBAI: add additional fields
+task_schema = {
     "slug":         {'required': True, 'type': 'string'},
     "authors":      {'required': False, 'type': 'list',
                      'schema': {'type': 'string'}},
@@ -102,6 +103,21 @@
                                      'schema': {'type': 'string'}},
                      "persist":     {'required': False, 'type': 'list',
                                      'schema': {'type': 'string'}}
-                    }}
+                    }},
+    "llm": {'required': False, 'type': 'dict', 'schema': {
+        'submission': {'required': True, 'type': 'string'},
+        'rubrics': {'required': False, 'type': 'string'},
+        'examples': {'required': False, 'type': 'string'},
+        'solution': {'required': False, 'type': 'string'},
+        'cot': {'required': False, 'type': 'boolean'},
+        'voting': {'required': False, 'type': 'integer'},
+        'post': {'required': False, 'type': 'string'},
+        'pre': {'required': False, 'type': 'string'},
+        'prompt': {'required': False, 'type': 'string'},
+        'temperature': {'required': False, 'type': 'float'},
+        'model': {'required': False, 'type': 'string'},
+        'max_points': {'required': False, 'type': 'float'},
+        'model_family': {'required': False, 'type': 'string'}
+    }},
 }
 
diff --git a/src/tests/resources/llm/complete-config/config.toml b/src/tests/resources/llm/complete-config/config.toml
@@ -0,0 +1,48 @@
+slug = "string_manipulation"
+
+
+max_attempts = 6
+refill = 43200 # 12 hours
+max_points = 2
+
+[information.en]
+title = "String Manipulation in Python"
+instructions_file = "instructions_en.md"
+
+[evaluator]
+docker_image = "python:latest"
+run_command = "python -m task.script"
+test_command = "python -m unittest discover -v task"
+grade_command = "python -m grading.tests"
+
+[llm]
+submission = "task/explanation.md"
+rubrics = 'rubrics/rubrics.toml' # This file contains the rubrics for the task to guide the model
+examples = 'grading/examples.toml' # This file contains examples of the task for the model to learn from
+solution = 'solution/explanation.md' # This is the solution file used to guide the model
+cot = true # This will add "think step by step" to the context prompt in order to encourage the model to think step by step
+voting = 3 # This allows for the results to be evaluated 3 times, and the most common result is chosen. This setting will increase the time it takes to grade the task!
+post = "grading/post.md" # Adds further instruction at the end of the context prompt
+# pre = "grading/pre.md" # Adds further instruction in front of the context prompt
+# prompt = "grading/prompt.md" # replaces the context prompt with the content of the file. This is only for advanced usages!
+temperature = 0.2 # Controls the randomness of the model's output
+model_family = "claude" # gpt or claude
+model = "claude-3-5-sonnet-latest" # gpt-4o, gpt-4o-mini, claude-3-5-sonnet-latest, etc.
+max_points = 1 # Max points for the sub-task that is passed to the model
+
+[files]
+visible = [
+  "task/script.py",
+  "task/explanation.md",
+]
+editable = [
+  "task/script.py",
+  "task/explanation.md",
+]
+grading = [
+  "grading/tests.py",
+]
+solution = [
+  "solution/script.py",
+  "solution/explanation.md",
+]
diff --git a/src/tests/resources/llm/complete-config/grading/examples.toml b/src/tests/resources/llm/complete-config/grading/examples.toml
@@ -0,0 +1,7 @@
+[[examples]]
+answer = "Reversing word order and reversing characters both have O(n) complexity, but character reversal requires more operations per word, making it slightly less efficient in practice."
+points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0.5 }
+
+[[examples]]
+answer = "Both operations have O(n) complexity because they process each character in the string."
+points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0 }
diff --git a/src/tests/resources/llm/complete-config/grading/post.md b/src/tests/resources/llm/complete-config/grading/post.md
@@ -0,0 +1 @@
+The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics!
diff --git a/src/tests/resources/llm/complete-config/grading/tests.py b/src/tests/resources/llm/complete-config/grading/tests.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+# Scaffolding necessary to set up ACCESS test
+import sys
+try: from universal.harness import *
+except: sys.path.append("../../universal/"); from harness import *
+
+# Grading test suite starts here
+
+script = grading_import("task", "script")
+
+class GradingTests(AccessTestCase):
+
+    def _test(self, sentence, expected):
+        actual = script.reverse_words(sentence)
+        self.hint(f"Reversal not correct for sentence='{sentence}'... expected result is '{expected}'!")
+        self.assertEqual(expected, actual)
+
+    def test_case1(self):
+        self._test("Hello World", "World Hello")
+
+    def test_case2(self):
+        self._test("  This   is  a  test  ", "test a is This")
+
+    def test_case3(self):
+        self._test("Python", "Python")
+
+    def test_case4(self):
+        self._test("", "")
+
+    def test_case5(self):
+        self._test("Hello, World!", "World! Hello,")
+
+    def test_case6(self):
+        self._test("123 456 789", "789 456 123")
+
+TestRunner().run(AccessTestSuite(1, [GradingTests]))
diff --git a/src/tests/resources/llm/complete-config/instructions_en.md b/src/tests/resources/llm/complete-config/instructions_en.md
@@ -0,0 +1,11 @@
+# String Manipulation in Python
+
+## Task Description
+
+Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example:
+
+- Input: `"Hello World"`
+- Output: `"World Hello"`
+
+Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided. 
+Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning.
diff --git a/src/tests/resources/llm/complete-config/rubrics/rubrics.toml b/src/tests/resources/llm/complete-config/rubrics/rubrics.toml
@@ -0,0 +1,9 @@
+[[rubrics]]
+id = "time_complexity_mentioned"
+title = "Mentioned the time complexity of both operations is O(n)"
+points = 0.5
+
+[[rubrics]]
+id = "asymptotically_equivalent"
+title = "Explained that both are asymptotically equivalent but in practice, reversing characters is slower"
+points = 0.5
diff --git a/src/tests/resources/llm/complete-config/solution/explanation.md b/src/tests/resources/llm/complete-config/solution/explanation.md
@@ -0,0 +1,16 @@
+Reversing the order of words in a sentence and reversing the characters within each word have the same asymptotic complexity, but differ in practical cost.
+
+ Time Complexity of Both Operations:
+     Reversing the word order involves splitting the string into a list of words O(n), reversing the list O(n), and joining it back into a string O(n). This results in an overall complexity of O(n).
+     Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n).
+
+ Which is More Efficient and Why:
+     Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation.
+     If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation.
+
+Example:
+
+    "Hello World" → "World Hello" (word order reversal)
+    "Hello World" → "olleH dlroW" (character reversal)
+
+While both methods scale similarly, reversing characters involves additional operations per word, making it slightly more complex in practical execution.
diff --git a/src/tests/resources/llm/complete-config/solution/script.py b/src/tests/resources/llm/complete-config/solution/script.py
@@ -0,0 +1,3 @@
+def reverse_words(sentence):
+    # Split the sentence into words, reverse the list, and join it back into a string
+    return ' '.join(sentence.split()[::-1])
diff --git a/src/tests/resources/llm/complete-config/task/explanation.md b/src/tests/resources/llm/complete-config/task/explanation.md
@@ -0,0 +1 @@
+[comment]: <> (Add your solution here:)
diff --git a/src/tests/resources/llm/complete-config/task/script.py b/src/tests/resources/llm/complete-config/task/script.py
@@ -0,0 +1,7 @@
+# Task: Implement a function `reverse_words` that takes a string
+# and returns the string with the order of words reversed.
+# Example: "Hello World" -> "World Hello"
+
+def reverse_words(sentence):
+    # TODO: Implement this function
+    pass
diff --git a/src/tests/resources/llm/invalid-model-family/config.toml b/src/tests/resources/llm/invalid-model-family/config.toml
@@ -0,0 +1,48 @@
+slug = "string_manipulation"
+
+
+max_attempts = 6
+refill = 43200 # 12 hours
+max_points = 2
+
+[information.en]
+title = "String Manipulation in Python"
+instructions_file = "instructions_en.md"
+
+[evaluator]
+docker_image = "python:latest"
+run_command = "python -m task.script"
+test_command = "python -m unittest discover -v task"
+grade_command = "python -m grading.tests"
+
+[llm]
+submission = "task/explanation.md"
+rubrics = 'rubrics/rubrics.toml' # This file contains the rubrics for the task to guide the model
+examples = 'grading/examples.toml' # This file contains examples of the task for the model to learn from
+solution = 'solution/explanation.md' # This is the solution file used to guide the model
+cot = true # This will add "think step by step" to the context prompt in order to encourage the model to think step by step
+voting = 3 # This allows for the results to be evaluated 3 times, and the most common result is chosen. This setting will increase the time it takes to grade the task!
+post = "grading/post.md" # Adds further instruction at the end of the context prompt
+# pre = "grading/pre.md" # Adds further instruction in front of the context prompt
+# prompt = "grading/prompt.md" # replaces the context prompt with the content of the file. This is only for advanced usages!
+temperature = 0.2 # Controls the randomness of the model's output
+model_family = "invalid" # This is invalid and should trigger an error
+model = "claude-3-5-sonnet-latest" # gpt-4o, gpt-4o-mini, claude-3-5-sonnet-latest, etc.
+max_points = 1 # Max points for the sub-task that is passed to the model
+
+[files]
+visible = [
+  "task/script.py",
+  "task/explanation.md",
+]
+editable = [
+  "task/script.py",
+  "task/explanation.md",
+]
+grading = [
+  "grading/tests.py",
+]
+solution = [
+  "solution/script.py",
+  "solution/explanation.md",
+]
diff --git a/src/tests/resources/llm/invalid-model-family/grading/examples.toml b/src/tests/resources/llm/invalid-model-family/grading/examples.toml
@@ -0,0 +1,7 @@
+[[examples]]
+answer = "Reversing word order and reversing characters both have O(n) complexity, but character reversal requires more operations per word, making it slightly less efficient in practice."
+points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0.5 }
+
+[[examples]]
+answer = "Both operations have O(n) complexity because they process each character in the string."
+points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0 }
diff --git a/src/tests/resources/llm/invalid-model-family/grading/post.md b/src/tests/resources/llm/invalid-model-family/grading/post.md
@@ -0,0 +1 @@
+The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics!
diff --git a/src/tests/resources/llm/invalid-model-family/grading/tests.py b/src/tests/resources/llm/invalid-model-family/grading/tests.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+# Scaffolding necessary to set up ACCESS test
+import sys
+try: from universal.harness import *
+except: sys.path.append("../../universal/"); from harness import *
+
+# Grading test suite starts here
+
+script = grading_import("task", "script")
+
+class GradingTests(AccessTestCase):
+
+    def _test(self, sentence, expected):
+        actual = script.reverse_words(sentence)
+        self.hint(f"Reversal not correct for sentence='{sentence}'... expected result is '{expected}'!")
+        self.assertEqual(expected, actual)
+
+    def test_case1(self):
+        self._test("Hello World", "World Hello")
+
+    def test_case2(self):
+        self._test("  This   is  a  test  ", "test a is This")
+
+    def test_case3(self):
+        self._test("Python", "Python")
+
+    def test_case4(self):
+        self._test("", "")
+
+    def test_case5(self):
+        self._test("Hello, World!", "World! Hello,")
+
+    def test_case6(self):
+        self._test("123 456 789", "789 456 123")
+
+TestRunner().run(AccessTestSuite(1, [GradingTests]))
diff --git a/src/tests/resources/llm/invalid-model-family/instructions_en.md b/src/tests/resources/llm/invalid-model-family/instructions_en.md
@@ -0,0 +1,11 @@
+# String Manipulation in Python
+
+## Task Description
+
+Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example:
+
+- Input: `"Hello World"`
+- Output: `"World Hello"`
+
+Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided. 
+Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning.
diff --git a/src/tests/resources/llm/invalid-model-family/rubrics/rubrics.toml b/src/tests/resources/llm/invalid-model-family/rubrics/rubrics.toml
@@ -0,0 +1,9 @@
+[[rubrics]]
+id = "time_complexity_mentioned"
+title = "Mentioned the time complexity of both operations is O(n)"
+points = 0.5
+
+[[rubrics]]
+id = "asymptotically_equivalent"
+title = "Explained that both are asymptotically equivalent but in practice, reversing characters is slower"
+points = 0.5
diff --git a/src/tests/resources/llm/invalid-model-family/solution/explanation.md b/src/tests/resources/llm/invalid-model-family/solution/explanation.md
@@ -0,0 +1,16 @@
+Reversing the order of words in a sentence and reversing the characters within each word have the same asymptotic complexity, but differ in practical cost.
+
+ Time Complexity of Both Operations:
+     Reversing the word order involves splitting the string into a list of words O(n), reversing the list O(n), and joining it back into a string O(n). This results in an overall complexity of O(n).
+     Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n).
+
+ Which is More Efficient and Why:
+     Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation.
+     If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation.
+
+Example:
+
+    "Hello World" → "World Hello" (word order reversal)
+    "Hello World" → "olleH dlroW" (character reversal)
+
+While both methods scale similarly, reversing characters involves additional operations per word, making it slightly more complex in practical execution.
diff --git a/src/tests/resources/llm/invalid-model-family/solution/script.py b/src/tests/resources/llm/invalid-model-family/solution/script.py
@@ -0,0 +1,3 @@
+def reverse_words(sentence):
+    # Split the sentence into words, reverse the list, and join it back into a string
+    return ' '.join(sentence.split()[::-1])
diff --git a/src/tests/resources/llm/invalid-model-family/task/explanation.md b/src/tests/resources/llm/invalid-model-family/task/explanation.md
@@ -0,0 +1 @@
+[comment]: <> (Add your solution here:)
diff --git a/src/tests/resources/llm/invalid-model-family/task/script.py b/src/tests/resources/llm/invalid-model-family/task/script.py
@@ -0,0 +1,7 @@
+# Task: Implement a function `reverse_words` that takes a string
+# and returns the string with the order of words reversed.
+# Example: "Hello World" -> "World Hello"
+
+def reverse_words(sentence):
+    # TODO: Implement this function
+    pass
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics!