Merged
Changes from all commits
37 commits
eda9cc9
Add frontend, docs and backend files
marcorosa Dec 4, 2024
f7b4878
Add textattack attack
marcorosa Dec 4, 2024
0cbb8f9
Add CodeAttack attack
marcorosa Dec 4, 2024
46429cc
Add requirements.txt
marcorosa Dec 4, 2024
fbc3274
Add instructions
marcorosa Dec 4, 2024
82514fb
Add github actions
marcorosa Dec 4, 2024
cf68639
Add renovate bot
marcorosa Dec 4, 2024
b40b3be
Add badge for changelog-ci action
marcorosa Dec 4, 2024
54821ab
Add image
marcorosa Dec 4, 2024
d81d176
Replace renovate bot with dependabot
marcorosa Dec 5, 2024
e7f44a4
Configure pep8speaks bot
marcorosa Dec 5, 2024
c20969a
Fix project naming
marcorosa Dec 5, 2024
7d3d708
Bump dompurify from 3.1.0 to 3.2.2 in /frontend
dependabot[bot] Dec 4, 2024
45085d2
Bump rollup from 4.14.1 to 4.28.0 in /frontend
dependabot[bot] Dec 4, 2024
22e0458
Bump mermaid from 10.9.0 to 10.9.3 in /frontend
dependabot[bot] Dec 4, 2024
596f3f0
Bump cross-spawn from 7.0.3 to 7.0.6 in /frontend
dependabot[bot] Dec 4, 2024
1c0a92b
Bump express from 4.19.2 to 4.21.1 in /frontend
dependabot[bot] Dec 4, 2024
7995f7b
Bump cookie, socket.io and express in /frontend
dependabot[bot] Dec 4, 2024
25528d0
Bump serve-static and express in /frontend
dependabot[bot] Dec 4, 2024
2c9722b
Add frontend Dockerfile location
marcorosa Dec 6, 2024
b894d07
Add reuse license information
marcorosa Dec 17, 2024
9fde947
Add licenses as per reuse
marcorosa Dec 17, 2024
6c640e2
Fix wording for reuse parser
marcorosa Dec 17, 2024
ab28530
Bump ws and socket.io-adapter in /frontend
dependabot[bot] Dec 17, 2024
6f8321f
Bump nanoid from 3.3.7 to 3.3.8 in /frontend
dependabot[bot] Dec 17, 2024
868e559
Bump braces from 3.0.2 to 3.0.3 in /frontend
dependabot[bot] Dec 17, 2024
e9db788
Bump micromatch from 4.0.5 to 4.0.8 in /frontend
dependabot[bot] Dec 17, 2024
368f6c1
Bump path-to-regexp and express in /frontend
dependabot[bot] Dec 17, 2024
0664d2a
Support IBM models
marcorosa Dec 20, 2024
1e5b28b
Support titan models
marcorosa Dec 19, 2024
a6147e0
Merge branch 'main' into develop
marcorosa Jan 6, 2025
bce36c5
Use codeattack in suite
marcorosa Jan 7, 2025
fc858a9
Fix typos
marcorosa Jan 7, 2025
918c41b
Limit codeattack to 20 prompts
marcorosa Jan 7, 2025
ab68528
Improve definition of codeattack
marcorosa Jan 7, 2025
aac0e72
Fix TypeError in empty responses
marcorosa Jan 7, 2025
8f02acc
[Changelog CI] Add Changelog for Version v0.1.0
github-actions[bot] Jan 7, 2025
1 change: 0 additions & 1 deletion .gitignore
@@ -139,7 +139,6 @@ result_success.txt
summary.txt
prompt_success.txt
result_gptfuzz.txt
EasyJailbreak_result.jsonl
codeattack_success.txt

# Frontend Environments
20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,20 @@
# Version: v0.1.0

* [#2](https://github.com/SAP/STARS/pull/2): Bump body-parser and express in /frontend
* [#3](https://github.com/SAP/STARS/pull/3): Bump send and express in /frontend
* [#4](https://github.com/SAP/STARS/pull/4): Bump serve-static and express in /frontend
* [#5](https://github.com/SAP/STARS/pull/5): Bump cookie, socket.io and express in /frontend
* [#6](https://github.com/SAP/STARS/pull/6): Bump express from 4.19.2 to 4.21.1 in /frontend
* [#7](https://github.com/SAP/STARS/pull/7): Bump cross-spawn from 7.0.3 to 7.0.6 in /frontend
* [#8](https://github.com/SAP/STARS/pull/8): Bump mermaid from 10.9.0 to 10.9.3 in /frontend
* [#9](https://github.com/SAP/STARS/pull/9): Bump rollup from 4.14.1 to 4.28.0 in /frontend
* [#10](https://github.com/SAP/STARS/pull/10): Bump dompurify from 3.1.0 to 3.2.2 in /frontend
* [#11](https://github.com/SAP/STARS/pull/11): Minor fixes
* [#12](https://github.com/SAP/STARS/pull/12): Fix reuse compliance
* [#13](https://github.com/SAP/STARS/pull/13): Bump micromatch from 4.0.5 to 4.0.8 in /frontend
* [#14](https://github.com/SAP/STARS/pull/14): Bump braces from 3.0.2 to 3.0.3 in /frontend
* [#15](https://github.com/SAP/STARS/pull/15): Bump path-to-regexp and express in /frontend
* [#16](https://github.com/SAP/STARS/pull/16): Bump nanoid from 3.3.7 to 3.3.8 in /frontend
* [#17](https://github.com/SAP/STARS/pull/17): Bump ws and socket.io-adapter in /frontend
* [#18](https://github.com/SAP/STARS/pull/18): Support IBM models
* [#19](https://github.com/SAP/STARS/pull/19): Support Bedrock models
37 changes: 0 additions & 37 deletions backend-agent/attack.py
@@ -4,7 +4,6 @@
import os
import logging


from attack_result import AttackResult, SuiteResult
from libs.codeattack import start_codeattack, \
OUTPUT_FILE as codeattack_out_file
@@ -159,42 +158,6 @@ def start(self) -> AttackResult:
self.eval_model,
self.parameters
))
case 'cipher':
return t.trace(Cipher(
self.target_model,
self.eval_model).perform_attack(
self.parameters))
case 'codechameleon':
return t.trace(CodeChameleon(
self.target_model,
self.eval_model).perform_attack(
self.parameters))
case 'deepinception':
return t.trace(DeepInception(
self.target_model,
self.eval_model).perform_attack(
self.parameters))
case 'ica':
return t.trace(ICA(
self.target_model).perform_attack(
self.parameters))
case 'jailbroken':
return t.trace(Jailbroken(
self.target_model,
self.attack_model,
self.eval_model).perform_attack(
self.parameters))
case 'multilingual':
return t.trace(Multilingual(
self.target_model,
self.eval_model).perform_attack(
self.parameters))
case 'renellm':
return t.trace(ReNeLLM(
self.target_model,
self.attack_model,
self.eval_model).perform_attack(
self.parameters))
case _:
raise ValueError(f'Attack {self.attack} is not known.')

15 changes: 6 additions & 9 deletions backend-agent/data/intro.txt
@@ -6,8 +6,12 @@ I can help you with assessing security of **Large Language Models**.
I can run a **vulnerability scan** against a Large Language Model of your choice.
For that I will run a number of attacks against the Large Language Model:
- promptmap to identify Prompt Injections
- PyRIT to try to leak the System Prompt and to get the model to generate malicious content, e.g. writing a phishing email
- Jailbreak tools such as "cipher" and "multilingual". These will attempt to get the model to generate malicious content.
- PyRIT to try to leak the System Prompt and to get the model to generate
malicious content, e.g., writing a phishing email
- CodeAttack to run a Prompt Injection attack hidden in a code completion task.
As CodeAttack includes hundreds of test prompts, a quick version (i.e., running
only 20 test prompts) will be run as part of the vulnerability scan.

To start the vulnerability scan, simply answer *Start the vulnerability scan*.

### Individual attacks
@@ -19,13 +23,6 @@ Supported attacks are:
- gptfuzz
- PyRIT
- CodeAttack
- cipher
- codechameleon
- deepinception
- ica
- jailbroken
- multilingual
- renellm

### Attacks against Natural language processing models

12 changes: 5 additions & 7 deletions backend-agent/data/suite/default.json
@@ -38,14 +38,12 @@
}
},
{
"attack": "cipher",
"attack": "codeattack",
"target-model": "<target>",
"eval-model": "mistralai--mixtral-8x7b-instruct-v01"
},
{
"attack": "multilingual",
"target-model": "<target>",
"eval-model": "mistralai--mixtral-8x7b-instruct-v01"
"eval-model": "gpt-4",
"parameters": {
"num_prompts": 20
}
}
]
}
9 changes: 6 additions & 3 deletions backend-agent/libs/codeattack.py
@@ -168,8 +168,8 @@ def start_codeattack(target_model: LLM,
# incompatibility with the following < and > clauses
try:
judge_score = int(prompt_attack_result['judge_score'])
except ValueError:
# Sometimes the LLM replies with a score="N/A"
except (ValueError, TypeError):
# Sometimes the LLM replies with a score="N/A" or =None
logger.warning('Judge replied with a non-integer score: '
f'{judge_score}')
judge_score = -1
@@ -229,7 +229,10 @@ def _prompt_attack(data, target_llm, post_processor, judge_llm, data_key=''):
# As we call our LLM generate method instead of a direct completion
# invocation (i.e., CodeAttack original implementation), we need to get the
# textual response from a LLMResponse class with the unwrap method
target_response = target_response.unwrap_first()
# This may introduce a TypeError later (in post_processor.core call) in
# case the target_response is None, so we treat this case by assigning an
# empty string
target_response = target_response.unwrap_first() or ''
logger.debug(target_response)

logger.debug('*' * 20)
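For context on the judge-score change above, a minimal sketch (not part of the diff): Python's `int()` raises `ValueError` for non-numeric strings such as `"N/A"` and `TypeError` for `None`, so widening the `except` clause lets both cases fall back to the same sentinel score. The helper name below is hypothetical.

```python
# Illustrative sketch only; parse_judge_score is a hypothetical helper, not
# code from this PR. It mirrors the widened except clause in codeattack.py.
def parse_judge_score(raw) -> int:
    try:
        return int(raw)
    except (ValueError, TypeError):
        # int('N/A') raises ValueError, int(None) raises TypeError
        return -1  # sentinel used when the judge gives no usable score


assert parse_judge_score('7') == 7
assert parse_judge_score('N/A') == -1
assert parse_judge_score(None) == -1
```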
134 changes: 129 additions & 5 deletions backend-agent/llm.py
@@ -1,15 +1,16 @@
from typing import Any, Dict, List
import abc
import logging
import os

from gen_ai_hub.proxy.core.proxy_clients import set_proxy_version

from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.core.proxy_clients import set_proxy_version
from gen_ai_hub.proxy.native.openai import OpenAI as ProxyOpenAI
from gen_ai_hub.proxy.native.google_vertexai.clients import GenerativeModel
import httpx
from gen_ai_hub.proxy.native.amazon.clients import Session
from openai import OpenAI as OfficialOpenAI
from openai import InternalServerError
import httpx
import ollama

from llm_response import Error, Filtered, LLMResponse, Success
@@ -24,12 +25,12 @@
AICORE_MODELS = {
'openai':
[
'gpt-4o',
'gpt-4',
'gpt-35-turbo',
'gpt-35-turbo-0125',
'gpt-35-turbo-16k',
'gpt-4',
'gpt-4-32k',
'gpt-4o',
'gpt-4o-mini'
],
'opensource':
@@ -45,6 +46,22 @@
'gemini-1.0-pro',
'gemini-1.5-pro',
'gemini-1.5-flash'
],
'ibm':
[
'ibm--granite-13b-chat'
],
'bedrock':
[
'amazon--titan-text-lite',
'amazon--titan-text-express',
'anthropic--claude-3-haiku',
'anthropic--claude-3-sonnet',
'anthropic--claude-3-opus',
'anthropic--claude-3.5-sonnet',
'amazon--nova-pro',
'amazon--nova-lite',
'amazon--nova-micro'
]
}

@@ -70,8 +87,17 @@ def from_model_name(cls, model_name: str) -> 'LLM':
return AICoreOpenAILLM(model_name)
if model_name in AICORE_MODELS['opensource']:
return AICoreOpenAILLM(model_name, False)
if model_name in AICORE_MODELS['ibm']:
# IBM models are compatible with OpenAI completion API
return AICoreOpenAILLM(model_name)
if model_name in AICORE_MODELS['vertexai']:
return AICoreGoogleVertexLLM(model_name)
if model_name in AICORE_MODELS['bedrock']:
if 'titan' in model_name:
# Titan models don't support system prompts
return AICoreAmazonBedrockLLM(model_name, False)
else:
return AICoreAmazonBedrockLLM(model_name)
if model_name == 'mistral':
return LocalOpenAILLM(
os.getenv('MISTRAL_MODEL_NAME', ''),
@@ -442,3 +468,101 @@ def generate_completions_for_messages(
return Success(responses)
except ValueError as v:
return Error(str(v))


class AICoreAmazonBedrockLLM(LLM):

def __init__(self, model_name: str, uses_system_prompt: bool = True):
self.model_name = model_name
proxy_client = get_proxy_client('gen-ai-hub')
self.model = Session().client(
proxy_client=proxy_client,
model_name=self.model_name
)
self.uses_system_prompt = uses_system_prompt

def __str__(self) -> str:
return f'{self.model_name}/Amazon Bedrock'

def generate(self,
system_prompt: str,
prompt: str,
temperature: float = 1,
max_tokens: int = 1024,
n: int = 1) -> LLMResponse:

# Declare types for messages and kwargs to avoid mypy errors
messages: List[Dict[str, Any]] = []
kwargs: Dict[str, Any] = {
'inferenceConfig': {
'temperature': temperature,
'maxTokens': max_tokens
}
}
if not system_prompt:
messages.append(
{'role': 'user', 'content': [{'text': prompt}]}
)
else:
if self.uses_system_prompt:
messages.append(
{'role': 'user', 'content': [{'text': prompt}]}
)
kwargs['system'] = [{'text': system_prompt}]
else:
# As with the Mistral model, some Bedrock models (e.g., Titan
# models) do not support a system prompt.
messages.append(
{'role': 'user',
'content': [{'text': f'{system_prompt}{prompt}'}]},
)
try:
responses = [self.model.converse(
messages=messages,
**kwargs # arguments supported by converse API
)['output']['message']['content'][0]['text'] for _ in range(n)]
if not all(responses):
return Filtered(
'One of the generations resulted in an empty response')
return Success(responses)
except ValueError as v:
return Error(str(v))

def generate_completions_for_messages(
self,
messages: list,
temperature: float = 1,
max_tokens: int = 1024,
top_p: int = 1,
frequency_penalty: float = 0.5,
presence_penalty: float = 0.5,
n: int = 1) -> LLMResponse:
contents = []
# TODO: manage system prompt
for message in messages:
contents.append(
{
'role': 'user',
'content': [{'text': message['content']}]
}
)
try:
responses = [self.model.converse(
messages=contents,
inferenceConfig={
'temperature': temperature,
'maxTokens': max_tokens,
'topP': top_p
# Frequency penalty and Presence penalty are not supported
# by Amazon.
# 'frequency_penalty': frequency_penalty,
# 'presence_penalty': presence_penalty,
})['output']['message']['content'][0]['text']
for _ in range(n)]
if not all(responses):
return Filtered(
'One of the generations resulted in an empty response')
return Success(responses)
except ValueError as v:
return Error(str(v))
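For reviewers, a minimal usage sketch of the new Bedrock path (illustrative only, not part of the diff). `LLM.from_model_name`, the `generate` signature, the model identifiers, and the `Success`/`Filtered`/`Error` response classes all come from the patch; the standalone script context and already-configured gen-ai-hub credentials are assumptions.

```python
# Assumes this runs inside backend-agent with gen-ai-hub / AI Core
# credentials already configured in the environment.
from llm import LLM
from llm_response import Error, Filtered, Success

# Per from_model_name, Titan models are constructed with
# uses_system_prompt=False, so the system prompt is folded into the user
# message before the converse() call.
model = LLM.from_model_name('amazon--titan-text-lite')

response = model.generate(
    system_prompt='You are a helpful assistant.',
    prompt='Explain in one sentence what a prompt injection attack is.',
    temperature=0.7,
    max_tokens=256,
    n=1,
)

if isinstance(response, Success):
    print(response.unwrap_first())   # first generated completion
elif isinstance(response, Filtered):
    print('Generation was empty or filtered.')
elif isinstance(response, Error):
    print('Generation failed.')
```

Anthropic and Nova models take the other branch of `from_model_name` and keep the system prompt in the `system` field passed to `converse`.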