From 5fadc676029a9f226a551a07a65becf129776d83 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 03:06:31 +0200
Subject: [PATCH 1/8] Some files for testing llm4decompile

---
 codebleu_example_cases.py            | 23 +++++++++++++++++++
 requirements_llm4decompile.txt       |  6 ++++++
 test_llm4decompile.py                | 19 ++++++++++++++++
 transform_cases_for_llm4decompile.py | 34 ++++++++++++++++++++++++++++
 4 files changed, 82 insertions(+)
 create mode 100644 codebleu_example_cases.py
 create mode 100644 requirements_llm4decompile.txt
 create mode 100644 test_llm4decompile.py
 create mode 100644 transform_cases_for_llm4decompile.py

diff --git a/codebleu_example_cases.py b/codebleu_example_cases.py
new file mode 100644
index 0000000..03c47a1
--- /dev/null
+++ b/codebleu_example_cases.py
@@ -0,0 +1,23 @@
+import os
+from codebleu import calc_codebleu
+
+def read_code_from_file(file_path):
+    with open(file_path, 'r') as file:
+        code = file.read()
+    return code
+
+if __name__ == '__main__':
+    base_path = "data_codebleu_examples"
+    reference_file = os.path.join(base_path, "reference.txt")
+
+    reference_code = read_code_from_file(reference_file)
+
+    for i in range(4):
+        filename = f'src{i+1}.txt'
+        prediction_code = read_code_from_file(os.path.join(base_path, filename))
+        bleu = calc_codebleu([reference_code], [prediction_code], lang="c", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)
+
+
+        print(f"Results for {filename}:")
+        for key, value in bleu.items():
+            print(f"{key}: {value:.2%}")
diff --git a/requirements_llm4decompile.txt b/requirements_llm4decompile.txt
new file mode 100644
index 0000000..ce1dd92
--- /dev/null
+++ b/requirements_llm4decompile.txt
@@ -0,0 +1,6 @@
+tqdm
+transformers
+loguru
+text_generation
+vllm
+flash-attn
diff --git a/test_llm4decompile.py b/test_llm4decompile.py
new file mode 100644
index 0000000..a4f277b
--- /dev/null
+++ b/test_llm4decompile.py
@@ -0,0 +1,19 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 model
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
+
+with open("data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm", 'r') as f:  # optimization level O0
+    asm_func = f.read()
+inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model.generate(**inputs, max_new_tokens=2048)  # model context is 4096 tokens; prompt + new tokens must fit
+c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
+
+with open("data_decompile_eval/sources/task_000.c", 'r') as f:  # original source file (placeholder path, adjust as needed)
+    func = f.read()
+
+print(f'original function:\n{func}')  # only one function is decompiled; the original file may contain several
+print(f'decompiled function:\n{c_func_decompile}')
diff --git a/transform_cases_for_llm4decompile.py b/transform_cases_for_llm4decompile.py
new file mode 100644
index 0000000..50690b0
--- /dev/null
+++ b/transform_cases_for_llm4decompile.py
@@ -0,0 +1,34 @@
+import os
+from src.util import create_folder_if_not_exists
+
+base_dir = "data_decompile_eval/disassemblies"
+prompt_folder = os.path.join('data_decompile_eval', 'llm4decompileprompts')
+create_folder_if_not_exists(prompt_folder)
+
+for output_file in sorted(os.listdir(base_dir)):
+    prompt_filename = os.path.join(prompt_folder, f"{output_file}_prompt.txt")
+    input_asm = ''
+    file_path = os.path.join(base_dir, output_file)
print(f"reading {file_path}") + with open(file_path) as f:#asm file + asm= f.read() + if '<'+'.text'+'>:' not in asm: #IMPORTANT replace func0 with the function name + raise ValueError("compile fails") + asm = '<'+'.text'+'>:' + asm.split('<'+'.text'+'>:')[-1].split('\n\n')[0] #IMPORTANT replace func0 with the function name + asm_clean = "" + asm_sp = asm.split("\n") + for tmp in asm_sp: + if len(tmp.split("\t"))<3 and '00' in tmp: + continue + idx = min( + len(tmp.split("\t")) - 1, 2 + ) + tmp_asm = "\t".join(tmp.split("\t")[idx:]) # remove the binary code + tmp_asm = tmp_asm.split("#")[0].strip() # remove the comments + asm_clean += tmp_asm + "\n" + input_asm = asm_clean.strip() + before = f"# This is the assembly code:\n"#prompt + after = "\n# What is the source code?\n"#prompt + input_asm_prompt = before+input_asm.strip()+after + with open(prompt_filename +'_' + '.asm','w',encoding='utf-8') as f: + f.write(input_asm_prompt) From 18dc52efb55692ab403ec95edca92261271a129c Mon Sep 17 00:00:00 2001 From: Martin <1224973+mavaa@users.noreply.github.com> Date: Fri, 24 May 2024 09:47:47 +0200 Subject: [PATCH 2/8] codebleu example test --- data_codebleu_examples/reference.txt | 1 + data_codebleu_examples/src1.txt | 1 + data_codebleu_examples/src2.txt | 1 + data_codebleu_examples/src3.txt | 1 + data_codebleu_examples/src4.txt | 1 + 5 files changed, 5 insertions(+) create mode 100644 data_codebleu_examples/reference.txt create mode 100644 data_codebleu_examples/src1.txt create mode 100644 data_codebleu_examples/src2.txt create mode 100644 data_codebleu_examples/src3.txt create mode 100644 data_codebleu_examples/src4.txt diff --git a/data_codebleu_examples/reference.txt b/data_codebleu_examples/reference.txt new file mode 100644 index 0000000..100a612 --- /dev/null +++ b/data_codebleu_examples/reference.txt @@ -0,0 +1 @@ +float trun_num(float num) { return num - (int)num; } diff --git a/data_codebleu_examples/src1.txt b/data_codebleu_examples/src1.txt new file mode 100644 index 0000000..4828ff8 --- /dev/null +++ b/data_codebleu_examples/src1.txt @@ -0,0 +1 @@ +float trun_num(float num) { return (int)num - (int)num; } diff --git a/data_codebleu_examples/src2.txt b/data_codebleu_examples/src2.txt new file mode 100644 index 0000000..b669906 --- /dev/null +++ b/data_codebleu_examples/src2.txt @@ -0,0 +1 @@ +float trun_num(float num) { return num - num; } diff --git a/data_codebleu_examples/src3.txt b/data_codebleu_examples/src3.txt new file mode 100644 index 0000000..ae5db8f --- /dev/null +++ b/data_codebleu_examples/src3.txt @@ -0,0 +1 @@ +float func(float x) { return x - int(x); } diff --git a/data_codebleu_examples/src4.txt b/data_codebleu_examples/src4.txt new file mode 100644 index 0000000..a357103 --- /dev/null +++ b/data_codebleu_examples/src4.txt @@ -0,0 +1 @@ +float func(float f) { int i = (int)f; return f - i; } From 272e88589bd5545b168fbcbdc59e93faceb43785 Mon Sep 17 00:00:00 2001 From: Martin <1224973+mavaa@users.noreply.github.com> Date: Fri, 24 May 2024 19:41:46 +0200 Subject: [PATCH 3/8] Renamed llm4decompile test so that pytest wont try to run it --- test_llm4decompile.py => runtest_llm4decompile.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test_llm4decompile.py => runtest_llm4decompile.py (100%) diff --git a/test_llm4decompile.py b/runtest_llm4decompile.py similarity index 100% rename from test_llm4decompile.py rename to runtest_llm4decompile.py From ddc22bb8759926db616ec91b111cf22572f59e99 Mon Sep 17 00:00:00 2001 From: Martin <1224973+mavaa@users.noreply.github.com> 
Date: Fri, 24 May 2024 20:58:44 +0200
Subject: [PATCH 4/8] Had to rename again apparently...

---
 runtest_llm4decompile.py => run_llm4decompile.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename runtest_llm4decompile.py => run_llm4decompile.py (100%)

diff --git a/runtest_llm4decompile.py b/run_llm4decompile.py
similarity index 100%
rename from runtest_llm4decompile.py
rename to run_llm4decompile.py

From ddc4acf829fff5d910ddcf9c9314a00a47aca62d Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 21:03:08 +0200
Subject: [PATCH 5/8] Added a __main__ guard, which apparently fixes things

---
 run_llm4decompile.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/run_llm4decompile.py b/run_llm4decompile.py
index a4f277b..146766f 100644
--- a/run_llm4decompile.py
+++ b/run_llm4decompile.py
@@ -1,19 +1,20 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 model
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
+if __name__ == '__main__':
+    model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 model
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
 
-with open("data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm", 'r') as f:  # optimization level O0
-    asm_func = f.read()
-inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
-with torch.no_grad():
-    outputs = model.generate(**inputs, max_new_tokens=2048)  # model context is 4096 tokens; prompt + new tokens must fit
-c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
+    with open("data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm", 'r') as f:  # optimization level O0
+        asm_func = f.read()
+    inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=2048)  # model context is 4096 tokens; prompt + new tokens must fit
+    c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
 
-with open("data_decompile_eval/sources/task_000.c", 'r') as f:  # original source file (placeholder path, adjust as needed)
-    func = f.read()
+    with open("data_decompile_eval/sources/task_000.c", 'r') as f:  # original source file (placeholder path, adjust as needed)
+        func = f.read()
 
-print(f'original function:\n{func}')  # only one function is decompiled; the original file may contain several
-print(f'decompiled function:\n{c_func_decompile}')
+    print(f'original function:\n{func}')  # only one function is decompiled; the original file may contain several
+    print(f'decompiled function:\n{c_func_decompile}')

From 2846ce106f66ebe74fc8ebe2131eb1c27414c113 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 21:32:29 +0200
Subject: [PATCH 6/8] Removed --doctest-modules since we don't use it

---
 .github/workflows/python-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml
index be43e9f..3b4ccfe 100644
--- a/.github/workflows/python-tests.yml
+++ b/.github/workflows/python-tests.yml
@@ -29,7 +29,7 @@ jobs:
         pip install -r requirements_dev.txt
     - name: Run pytest
       run: |
-        pytest --doctest-modules --junitxml=junit/test-results.xml --cov=src --cov-report=xml --cov-report=html
+        pytest --junitxml=junit/test-results.xml --cov=src --cov-report=xml --cov-report=html
     - name: Publish Test Results
       uses: EnricoMi/publish-unit-test-result-action@v2
       if: always()

From 79731da6ae5a606b810a61904e9f484d055bded6 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 23:35:55 +0200
Subject: [PATCH 7/8] Added llm4decompile submodule

---
 .gitmodules              | 3 +++
 submodules/llm4decompile | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 submodules/llm4decompile

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..efcecde
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "submodules/llm4decompile"]
+	path = submodules/llm4decompile
+	url = https://github.com/albertan017/LLM4Decompile
diff --git a/submodules/llm4decompile b/submodules/llm4decompile
new file mode 160000
index 0000000..aa23b74
--- /dev/null
+++ b/submodules/llm4decompile
@@ -0,0 +1 @@
+Subproject commit aa23b74ed144f50944755f772cb8a186540edfc5

From bb9227529d63c28140126ec3d2148d621b489c02 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 23:36:44 +0200
Subject: [PATCH 8/8] Script for running llm4decompile in a ROCm Docker
 container

---
 run_llm4decompile.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100755 run_llm4decompile.sh

diff --git a/run_llm4decompile.sh b/run_llm4decompile.sh
new file mode 100755
index 0000000..d9d35b9
--- /dev/null
+++ b/run_llm4decompile.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+source .venv/bin/activate
+source .env
+
+sudo docker run -it --network=host \
+    --device=/dev/kfd --device=/dev/dri \
+    --group-add=video --ipc=host \
+    --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+    --shm-size 8G \
+    -v ./:/src -w /src \
+    rocm/pytorch
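
Note: transform_cases_for_llm4decompile.py imports create_folder_if_not_exists
from src.util, which is not part of this series. A minimal sketch of what that
helper presumably looks like (the module path comes from the import; the body
is an assumption -- os.makedirs(path, exist_ok=True) would behave the same way):

    # src/util.py -- assumed helper, not included in these patches
    import os

    def create_folder_if_not_exists(path):
        # Create the folder (including parents) if it does not already exist
        if not os.path.exists(path):
            os.makedirs(path)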
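
The series also stops short of scoring the model output: run_llm4decompile.py
prints the decompiled function but never feeds it to calc_codebleu. A sketch of
the missing glue, reusing the settings from codebleu_example_cases.py (the
script name and both file paths are hypothetical placeholders for wherever the
original source and the saved model output actually live):

    # score_decompilation.py -- hypothetical glue script; paths are placeholders
    from codebleu import calc_codebleu

    def score(reference_path, prediction_path):
        with open(reference_path) as f:
            reference = f.read()
        with open(prediction_path) as f:
            prediction = f.read()
        # Same settings as codebleu_example_cases.py: C source, equal weights
        return calc_codebleu([reference], [prediction], lang="c",
                             weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

    if __name__ == '__main__':
        results = score("data_decompile_eval/sources/task_000.c",
                        "data_decompile_eval/decompiled/task_000.c")
        for key, value in results.items():
            print(f"{key}: {value:.2%}")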