Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ __pycache__/
# C extensions
*.so


# Distribution / packaging
.Python
build/
Expand Down Expand Up @@ -76,5 +77,8 @@ tags
# Persistent undo
[._]*.un~

# Local
.idea/

# End of https://www.toptal.com/developers/gitignore/api/vim,python

19 changes: 11 additions & 8 deletions src/access_cli_sealuzh/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,6 @@

def main():
from access_cli_sealuzh.main import AccessValidator, autodetect

# TODO GBAI:
# * add option to pass API KEY
# * add additional options the service might need (model selection?)
# * by default, the AI service should stop after validation, add an option
# to keep the service running
# * ensure that there is some combination of options such that ONLY the AI
# grading is executed, since TAs will be relying on it to design the task
parser = argparse.ArgumentParser(
prog = 'access-cli',
description = 'Validate ACCESS course configurations using the CLI')
Expand Down Expand Up @@ -49,6 +41,17 @@ def main():
help = "recurse into nested structures (assignments/tasks) if applicable")
parser.add_argument('-A', '--auto-detect', action='store_true', default=False,
help = "attempt to auto-detect what is being validated")
parser.add_argument('-k', '--llm-api-key', type=str,
help = "API key for the LLM service. Create one at the corresponding service provider.")
parser.add_argument('-K', '--llm-keep-service', action=argparse.BooleanOptionalAction,
help = "Keep LLM service running after validation for further grading")
parser.add_argument('-L', '--llm-only', action=argparse.BooleanOptionalAction,
help = "Only run LLM grading (for TAs designing tasks)")
parser.add_argument('-U', '--assistant-url',
default="http://localhost:4000",
help = "URL of the assistant service")
parser.add_argument('-M', '--llm-model', type=str,
help = "Model to use for LLM grading")
args = parser.parse_args()


Expand Down
323 changes: 311 additions & 12 deletions src/access_cli_sealuzh/main.py

Large diffs are not rendered by default.

20 changes: 18 additions & 2 deletions src/access_cli_sealuzh/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,9 @@
# - if each file in files actually exists
# - that none of the grading or solution files are editable or visible
# - that editable files are also visible
# - that if llm evaluation is used, every file referenced in the llm config (submission, rubrics, examples, solution) actually exists
# - OPTIONALLY: that the run, test and grade commands execute correctly
task_schema = { # TODO GBAI: add additional fields
task_schema = {
"slug": {'required': True, 'type': 'string'},
"authors": {'required': False, 'type': 'list',
'schema': {'type': 'string'}},
Expand All @@ -102,6 +103,21 @@
'schema': {'type': 'string'}},
"persist": {'required': False, 'type': 'list',
'schema': {'type': 'string'}}
}}
}},
"llm": {'required': False, 'type': 'dict', 'schema': {
'submission': {'required': True, 'type': 'string'},
'rubrics': {'required': False, 'type': 'string'},
'examples': {'required': False, 'type': 'string'},
'solution': {'required': False, 'type': 'string'},
'cot': {'required': False, 'type': 'boolean'},
'voting': {'required': False, 'type': 'integer'},
'post': {'required': False, 'type': 'string'},
'pre': {'required': False, 'type': 'string'},
'prompt': {'required': False, 'type': 'string'},
'temperature': {'required': False, 'type': 'float'},
'model': {'required': False, 'type': 'string'},
'max_points': {'required': False, 'type': 'float'},
'model_family': {'required': False, 'type': 'string'}
}},
}

48 changes: 48 additions & 0 deletions src/tests/resources/llm/complete-config/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
slug = "string_manipulation"


max_attempts = 6
refill = 43200 # 12 hours
max_points = 2

[information.en]
title = "String Manipulation in Python"
instructions_file = "instructions_en.md"

[evaluator]
docker_image = "python:latest"
run_command = "python -m task.script"
test_command = "python -m unittest discover -v task"
grade_command = "python -m grading.tests"

[llm]
submission = "task/explanation.md"
rubrics = 'rubrics/rubrics.toml' # This file contains the rubrics for the task to guide the model
examples = 'grading/examples.toml' # This file contains examples of the task for the model to learn from
solution = 'solution/explanation.md' # This is the solution file used to guide the model
cot = true # This will add "think step by step" to the context prompt in order to encourage the model to think step by step
voting = 3 # This allows for the results to be evaluated 3 times, and the most common result is chosen. This setting will increase the time it takes to grade the task!
post = "grading/post.md" # Adds further instruction at the end of the context prompt
# pre = "grading/pre.md" # Adds further instruction in front of the context prompt
# prompt = "grading/prompt.md" # replaces the context prompt with the content of the file. This is only for advanced usages!
temperature = 0.2 # Controls the randomness of the model's output
model_family = "claude" # gpt or claude
model = "claude-3-5-sonnet-latest" # gpt-4o, gpt-4o-mini, claude-3-5-sonnet-latest, etc.
max_points = 1 # Max points for the sub-task that is passed to the model

[files]
visible = [
"task/script.py",
"task/explanation.md",
]
editable = [
"task/script.py",
"task/explanation.md",
]
grading = [
"grading/tests.py",
]
solution = [
"solution/script.py",
"solution/explanation.md",
]
7 changes: 7 additions & 0 deletions src/tests/resources/llm/complete-config/grading/examples.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[[examples]]
answer = "Reversing word order and reversing characters both have O(n) complexity, but character reversal requires more operations per word, making it slightly less efficient in practice."
points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0.5 }

[[examples]]
answer = "Both operations have O(n) complexity because they process each character in the string."
points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0 }
1 change: 1 addition & 0 deletions src/tests/resources/llm/complete-config/grading/post.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics!
37 changes: 37 additions & 0 deletions src/tests/resources/llm/complete-config/grading/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3

# Scaffolding necessary to set up ACCESS test
import sys
try:
    from universal.harness import *
except ImportError:
    # The `universal` package is not importable from here: add the relative
    # ../../universal/ directory to sys.path and import `harness` directly.
    # Only ImportError triggers the fallback, so unrelated failures
    # (KeyboardInterrupt, SystemExit, bugs inside the harness) are not hidden.
    sys.path.append("../../universal/")
    from harness import *

# Grading test suite starts here

# Student module under test (task/script), loaded through grading_import,
# which is presumably provided by the harness star-import above - TODO confirm.
script = grading_import("task", "script")

class GradingTests(AccessTestCase):
    """Grading checks for the student's `reverse_words` implementation."""

    def _test(self, sentence, expected):
        """Assert that `script.reverse_words(sentence)` equals `expected`."""
        actual = script.reverse_words(sentence)
        # The hint is registered before the assertion; presumably the harness
        # shows it to the student when the assertion fails - TODO confirm.
        self.hint(f"Reversal not correct for sentence='{sentence}'... expected result is '{expected}'!")
        self.assertEqual(expected, actual)

    def test_case1(self):
        # Two plain words are swapped.
        self._test("Hello World", "World Hello")

    def test_case2(self):
        # Leading and trailing whitespace must not appear in the result.
        self._test(" This is a test ", "test a is This")

    def test_case3(self):
        # A single word is returned unchanged.
        self._test("Python", "Python")

    def test_case4(self):
        # The empty string maps to the empty string.
        self._test("", "")

    def test_case5(self):
        # Punctuation stays attached to its word.
        self._test("Hello, World!", "World! Hello,")

    def test_case6(self):
        # Digit-only tokens are treated like any other word.
        self._test("123 456 789", "789 456 123")

# Runs at module level, i.e. whenever this file is executed or imported.
# The leading 1 is an argument defined by the harness (presumably the weight
# or point value of this suite - TODO confirm).
TestRunner().run(AccessTestSuite(1, [GradingTests]))
11 changes: 11 additions & 0 deletions src/tests/resources/llm/complete-config/instructions_en.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# String Manipulation in Python

## Task Description

Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example:

- Input: `"Hello World"`
- Output: `"World Hello"`

Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided.
Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning.
9 changes: 9 additions & 0 deletions src/tests/resources/llm/complete-config/rubrics/rubrics.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[[rubrics]]
id = "time_complexity_mentioned"
title = "Mentioned the time complexity of both operations is O(n)"
points = 0.5

[[rubrics]]
id = "asymptotically_equivalent"
title = "Explained that both are asymptotically equivalent but in practice, reversing characters is slower"
points = 0.5
16 changes: 16 additions & 0 deletions src/tests/resources/llm/complete-config/solution/explanation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Reversing the order of words in a sentence and reversing the characters within each word have the same asymptotic complexity but differ in practical cost.

Time Complexity of Both Operations:
Reversing the word order involves splitting the string into a list of words (O(n)), reversing the list (O(n)), and joining it back into a string (O(n)). This results in an overall complexity of O(n).
Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n).

Which is More Efficient and Why:
Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation.
If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation.

Example:

"Hello World" → "World Hello" (word order reversal)
"Hello World" → "olleH dlroW" (character reversal)

While both methods scale similarly, reversing characters involves additional operations per word, making it slightly more complex in practical execution.
3 changes: 3 additions & 0 deletions src/tests/resources/llm/complete-config/solution/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
def reverse_words(sentence):
    """Return `sentence` with its whitespace-separated words in reverse order.

    Words are joined with single spaces, so leading, trailing and repeated
    whitespace in the input does not appear in the result.
    """
    words = sentence.split()
    # Flip the word list in place before stitching it back together.
    words.reverse()
    return ' '.join(words)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[comment]: <> (Add your solution here:)
7 changes: 7 additions & 0 deletions src/tests/resources/llm/complete-config/task/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Task: Implement a function `reverse_words` that takes a string
# and returns the string with the order of words reversed.
# Example: "Hello World" -> "World Hello"

def reverse_words(sentence):
    """Return `sentence` with the order of its words reversed.

    Example: "Hello World" -> "World Hello". Not implemented yet: the
    placeholder body below returns None.
    """
    # TODO: Implement this function
    pass
48 changes: 48 additions & 0 deletions src/tests/resources/llm/invalid-model-family/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
slug = "string_manipulation"


max_attempts = 6
refill = 43200 # 12 hours
max_points = 2

[information.en]
title = "String Manipulation in Python"
instructions_file = "instructions_en.md"

[evaluator]
docker_image = "python:latest"
run_command = "python -m task.script"
test_command = "python -m unittest discover -v task"
grade_command = "python -m grading.tests"

[llm]
submission = "task/explanation.md"
rubrics = 'rubrics/rubrics.toml' # This file contains the rubrics for the task to guide the model
examples = 'grading/examples.toml' # This file contains examples of the task for the model to learn from
solution = 'solution/explanation.md' # This is the solution file used to guide the model
cot = true # This will add "think step by step" to the context prompt in order to encourage the model to think step by step
voting = 3 # This allows for the results to be evaluated 3 times, and the most common result is chosen. This setting will increase the time it takes to grade the task!
post = "grading/post.md" # Adds further instruction at the end of the context prompt
# pre = "grading/pre.md" # Adds further instruction in front of the context prompt
# prompt = "grading/prompt.md" # replaces the context prompt with the content of the file. This is only for advanced usages!
temperature = 0.2 # Controls the randomness of the model's output
model_family = "invalid" # This is invalid and should trigger an error
model = "claude-3-5-sonnet-latest" # gpt-4o, gpt-4o-mini, claude-3-5-sonnet-latest, etc.
max_points = 1 # Max points for the sub-task that is passed to the model

[files]
visible = [
"task/script.py",
"task/explanation.md",
]
editable = [
"task/script.py",
"task/explanation.md",
]
grading = [
"grading/tests.py",
]
solution = [
"solution/script.py",
"solution/explanation.md",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[[examples]]
answer = "Reversing word order and reversing characters both have O(n) complexity, but character reversal requires more operations per word, making it slightly less efficient in practice."
points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0.5 }

[[examples]]
answer = "Both operations have O(n) complexity because they process each character in the string."
points = { time_complexity_mentioned = 0.5, asymptotically_equivalent = 0 }
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The student answer will not contain any code. This is expected, since you only need to grade the explanation according to the rubrics!
37 changes: 37 additions & 0 deletions src/tests/resources/llm/invalid-model-family/grading/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python3

# Scaffolding necessary to set up ACCESS test
import sys
try:
    from universal.harness import *
except ImportError:
    # The `universal` package is not importable from here: add the relative
    # ../../universal/ directory to sys.path and import `harness` directly.
    # Only ImportError triggers the fallback, so unrelated failures
    # (KeyboardInterrupt, SystemExit, bugs inside the harness) are not hidden.
    sys.path.append("../../universal/")
    from harness import *

# Grading test suite starts here

# Student module under test (task/script), loaded through grading_import,
# which is presumably provided by the harness star-import above - TODO confirm.
script = grading_import("task", "script")

class GradingTests(AccessTestCase):
    """Grading checks for the student's `reverse_words` implementation."""

    def _test(self, sentence, expected):
        """Assert that `script.reverse_words(sentence)` equals `expected`."""
        actual = script.reverse_words(sentence)
        # The hint is registered before the assertion; presumably the harness
        # shows it to the student when the assertion fails - TODO confirm.
        self.hint(f"Reversal not correct for sentence='{sentence}'... expected result is '{expected}'!")
        self.assertEqual(expected, actual)

    def test_case1(self):
        # Two plain words are swapped.
        self._test("Hello World", "World Hello")

    def test_case2(self):
        # Leading and trailing whitespace must not appear in the result.
        self._test(" This is a test ", "test a is This")

    def test_case3(self):
        # A single word is returned unchanged.
        self._test("Python", "Python")

    def test_case4(self):
        # The empty string maps to the empty string.
        self._test("", "")

    def test_case5(self):
        # Punctuation stays attached to its word.
        self._test("Hello, World!", "World! Hello,")

    def test_case6(self):
        # Digit-only tokens are treated like any other word.
        self._test("123 456 789", "789 456 123")

# Runs at module level, i.e. whenever this file is executed or imported.
# The leading 1 is an argument defined by the harness (presumably the weight
# or point value of this suite - TODO confirm).
TestRunner().run(AccessTestSuite(1, [GradingTests]))
11 changes: 11 additions & 0 deletions src/tests/resources/llm/invalid-model-family/instructions_en.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# String Manipulation in Python

## Task Description

Your task is to implement a Python function called `reverse_words`. This function should take a single string input and return a string with the words in reverse order. For example:

- Input: `"Hello World"`
- Output: `"World Hello"`

Additionally, reflect on the differences in complexity between reversing the order of words in a sentence and reversing the characters within each word in the `explanation.md` file provided.
Which operation do you think is more computationally efficient, and why? Consider factors such as string manipulation methods and time complexity in your explanation. Provide examples to support your reasoning.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[[rubrics]]
id = "time_complexity_mentioned"
title = "Mentioned the time complexity of both operations is O(n)"
points = 0.5

[[rubrics]]
id = "asymptotically_equivalent"
title = "Explained that both are asymptotically equivalent but in practice, reversing characters is slower"
points = 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Reversing the order of words in a sentence and reversing the characters within each word have the same asymptotic complexity but differ in practical cost.

Time Complexity of Both Operations:
Reversing the word order involves splitting the string into a list of words (O(n)), reversing the list (O(n)), and joining it back into a string (O(n)). This results in an overall complexity of O(n).
Reversing characters within each word requires iterating through each word and reversing it (O(m) per word, where m is the word length). Since this must be done for all words, the total complexity remains O(n).

Which is More Efficient and Why:
Both operations have an O(n) complexity, but reversing words is generally more efficient in practice because it operates at a higher level (list reversal), whereas reversing characters requires more fine-grained string manipulation.
If implemented using in-place reversal, reversing characters within each word can introduce additional overhead compared to simple list manipulation.

Example:

"Hello World" → "World Hello" (word order reversal)
"Hello World" → "olleH dlroW" (character reversal)

While both methods scale similarly, reversing characters involves additional operations per word, making it slightly more complex in practical execution.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
def reverse_words(sentence):
    """Return `sentence` with its whitespace-separated words in reverse order.

    Words are joined with single spaces, so leading, trailing and repeated
    whitespace in the input does not appear in the result.
    """
    words = sentence.split()
    # Flip the word list in place before stitching it back together.
    words.reverse()
    return ' '.join(words)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[comment]: <> (Add your solution here:)
7 changes: 7 additions & 0 deletions src/tests/resources/llm/invalid-model-family/task/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Task: Implement a function `reverse_words` that takes a string
# and returns the string with the order of words reversed.
# Example: "Hello World" -> "World Hello"

def reverse_words(sentence):
    """Return `sentence` with the order of its words reversed.

    Example: "Hello World" -> "World Hello". Not implemented yet: the
    placeholder body below returns None.
    """
    # TODO: Implement this function
    pass
Loading