From 5fadc676029a9f226a551a07a65becf129776d83 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 03:06:31 +0200
Subject: [PATCH 1/8] Some files for testing llm4decompile

---
 codebleu_example_cases.py            | 23 +++++++++++++++++++
 requirements_llm4decompile.txt       |  6 ++++++
 test_llm4decompile.py                | 19 ++++++++++++++++
 transform_cases_for_llm4decompile.py | 34 ++++++++++++++++++++++++++++
 4 files changed, 82 insertions(+)
 create mode 100644 codebleu_example_cases.py
 create mode 100644 requirements_llm4decompile.txt
 create mode 100644 test_llm4decompile.py
 create mode 100644 transform_cases_for_llm4decompile.py

diff --git a/codebleu_example_cases.py b/codebleu_example_cases.py
new file mode 100644
index 0000000..03c47a1
--- /dev/null
+++ b/codebleu_example_cases.py
@@ -0,0 +1,23 @@
+import os
+from codebleu import calc_codebleu
+
+def read_code_from_file(file_path):
+    with open(file_path, 'r') as file:
+        code = file.read()
+    return code
+
+if __name__ == '__main__':
+    base_path = "data_codebleu_examples"
+    reference_file = os.path.join(base_path, "reference.txt")
+
+    reference_code = read_code_from_file(reference_file)
+
+    for i in range(4):
+        filename = f'src{i+1}.txt'
+        prediction_code = read_code_from_file(os.path.join(base_path, filename))
+        bleu = calc_codebleu([reference_code], [prediction_code], lang="c", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)
+
+
+        print(f"Results for {filename}:")
+        for key, value in bleu.items():
+            print(f"{key}: {value:.2%}")
diff --git a/requirements_llm4decompile.txt b/requirements_llm4decompile.txt
new file mode 100644
index 0000000..ce1dd92
--- /dev/null
+++ b/requirements_llm4decompile.txt
@@ -0,0 +1,6 @@
+tqdm
+transformers
+loguru
+text_generation
+vllm
+flash-attn
diff --git a/test_llm4decompile.py b/test_llm4decompile.py
new file mode 100644
index 0000000..a4f277b
--- /dev/null
+++ b/test_llm4decompile.py
@@ -0,0 +1,19 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 model
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
+
+with open("data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm", 'r') as f:  # optimization level O0
+    asm_func = f.read()
+inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    outputs = model.generate(**inputs, max_new_tokens=2048)  # model context is 4096 tokens; prompt + new tokens must fit
+c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
+
+with open("data_decompile_eval/sources/task_000.c", 'r') as f:  # original source file (placeholder path, adjust as needed)
+    func = f.read()
+
+print(f'original function:\n{func}')  # only one function is decompiled; the original file may contain several
+print(f'decompiled function:\n{c_func_decompile}')
diff --git a/transform_cases_for_llm4decompile.py b/transform_cases_for_llm4decompile.py
new file mode 100644
index 0000000..50690b0
--- /dev/null
+++ b/transform_cases_for_llm4decompile.py
@@ -0,0 +1,34 @@
+import os
+from src.util import create_folder_if_not_exists
+
+base_dir = "data_decompile_eval/disassemblies"
+prompt_folder = os.path.join('data_decompile_eval', 'llm4decompileprompts')
+create_folder_if_not_exists(prompt_folder)
+
+for output_file in sorted(os.listdir(base_dir)):
+    prompt_filename = os.path.join(prompt_folder, f"{output_file}_prompt.txt")
+    input_asm = ''
+    file_path = os.path.join(base_dir, output_file)
print(f"reading {file_path}") + with open(file_path) as f:#asm file + asm= f.read() + if '<'+'.text'+'>:' not in asm: #IMPORTANT replace func0 with the function name + raise ValueError("compile fails") + asm = '<'+'.text'+'>:' + asm.split('<'+'.text'+'>:')[-1].split('\n\n')[0] #IMPORTANT replace func0 with the function name + asm_clean = "" + asm_sp = asm.split("\n") + for tmp in asm_sp: + if len(tmp.split("\t"))<3 and '00' in tmp: + continue + idx = min( + len(tmp.split("\t")) - 1, 2 + ) + tmp_asm = "\t".join(tmp.split("\t")[idx:]) # remove the binary code + tmp_asm = tmp_asm.split("#")[0].strip() # remove the comments + asm_clean += tmp_asm + "\n" + input_asm = asm_clean.strip() + before = f"# This is the assembly code:\n"#prompt + after = "\n# What is the source code?\n"#prompt + input_asm_prompt = before+input_asm.strip()+after + with open(prompt_filename +'_' + '.asm','w',encoding='utf-8') as f: + f.write(input_asm_prompt) From 18dc52efb55692ab403ec95edca92261271a129c Mon Sep 17 00:00:00 2001 From: Martin <1224973+mavaa@users.noreply.github.com> Date: Fri, 24 May 2024 09:47:47 +0200 Subject: [PATCH 2/8] codebleu example test --- data_codebleu_examples/reference.txt | 1 + data_codebleu_examples/src1.txt | 1 + data_codebleu_examples/src2.txt | 1 + data_codebleu_examples/src3.txt | 1 + data_codebleu_examples/src4.txt | 1 + 5 files changed, 5 insertions(+) create mode 100644 data_codebleu_examples/reference.txt create mode 100644 data_codebleu_examples/src1.txt create mode 100644 data_codebleu_examples/src2.txt create mode 100644 data_codebleu_examples/src3.txt create mode 100644 data_codebleu_examples/src4.txt diff --git a/data_codebleu_examples/reference.txt b/data_codebleu_examples/reference.txt new file mode 100644 index 0000000..100a612 --- /dev/null +++ b/data_codebleu_examples/reference.txt @@ -0,0 +1 @@ +float trun_num(float num) { return num - (int)num; } diff --git a/data_codebleu_examples/src1.txt b/data_codebleu_examples/src1.txt new file mode 100644 index 0000000..4828ff8 --- /dev/null +++ b/data_codebleu_examples/src1.txt @@ -0,0 +1 @@ +float trun_num(float num) { return (int)num - (int)num; } diff --git a/data_codebleu_examples/src2.txt b/data_codebleu_examples/src2.txt new file mode 100644 index 0000000..b669906 --- /dev/null +++ b/data_codebleu_examples/src2.txt @@ -0,0 +1 @@ +float trun_num(float num) { return num - num; } diff --git a/data_codebleu_examples/src3.txt b/data_codebleu_examples/src3.txt new file mode 100644 index 0000000..ae5db8f --- /dev/null +++ b/data_codebleu_examples/src3.txt @@ -0,0 +1 @@ +float func(float x) { return x - int(x); } diff --git a/data_codebleu_examples/src4.txt b/data_codebleu_examples/src4.txt new file mode 100644 index 0000000..a357103 --- /dev/null +++ b/data_codebleu_examples/src4.txt @@ -0,0 +1 @@ +float func(float f) { int i = (int)f; return f - i; } From 272e88589bd5545b168fbcbdc59e93faceb43785 Mon Sep 17 00:00:00 2001 From: Martin <1224973+mavaa@users.noreply.github.com> Date: Fri, 24 May 2024 19:41:46 +0200 Subject: [PATCH 3/8] Renamed llm4decompile test so that pytest wont try to run it --- test_llm4decompile.py => runtest_llm4decompile.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test_llm4decompile.py => runtest_llm4decompile.py (100%) diff --git a/test_llm4decompile.py b/runtest_llm4decompile.py similarity index 100% rename from test_llm4decompile.py rename to runtest_llm4decompile.py From ddc22bb8759926db616ec91b111cf22572f59e99 Mon Sep 17 00:00:00 2001 From: Martin <1224973+mavaa@users.noreply.github.com> 
Date: Fri, 24 May 2024 20:58:44 +0200
Subject: [PATCH 4/8] Had to rename again apparently...

---
 runtest_llm4decompile.py => run_llm4decompile.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename runtest_llm4decompile.py => run_llm4decompile.py (100%)

diff --git a/runtest_llm4decompile.py b/run_llm4decompile.py
similarity index 100%
rename from runtest_llm4decompile.py
rename to run_llm4decompile.py

From ddc4acf829fff5d910ddcf9c9314a00a47aca62d Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 21:03:08 +0200
Subject: [PATCH 5/8] Added a __main__ guard, which apparently fixes things

---
 run_llm4decompile.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/run_llm4decompile.py b/run_llm4decompile.py
index a4f277b..146766f 100644
--- a/run_llm4decompile.py
+++ b/run_llm4decompile.py
@@ -1,19 +1,20 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 model
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
+if __name__ == '__main__':
+    model_path = 'LLM4Binary/llm4decompile-6.7b-v1.5'  # V1.5 model
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16).cuda()
 
-with open("data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm", 'r') as f:  # optimization level O0
-    asm_func = f.read()
-inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
-with torch.no_grad():
-    outputs = model.generate(**inputs, max_new_tokens=2048)  # model context is 4096 tokens; prompt + new tokens must fit
-c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
+    with open("data_decompile_eval/llm4decompileprompts/task_000_d.txt_prompt.txt_.asm", 'r') as f:  # optimization level O0
+        asm_func = f.read()
+    inputs = tokenizer(asm_func, return_tensors="pt").to(model.device)
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=2048)  # model context is 4096 tokens; prompt + new tokens must fit
+    c_func_decompile = tokenizer.decode(outputs[0][len(inputs[0]):-1])
 
-with open("data_decompile_eval/sources/task_000.c", 'r') as f:  # original source file (placeholder path, adjust as needed)
-    func = f.read()
+    with open("data_decompile_eval/sources/task_000.c", 'r') as f:  # original source file (placeholder path, adjust as needed)
+        func = f.read()
 
-print(f'original function:\n{func}')  # only one function is decompiled; the original file may contain several
-print(f'decompiled function:\n{c_func_decompile}')
+    print(f'original function:\n{func}')  # only one function is decompiled; the original file may contain several
+    print(f'decompiled function:\n{c_func_decompile}')

From 2846ce106f66ebe74fc8ebe2131eb1c27414c113 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 21:32:29 +0200
Subject: [PATCH 6/8] Removed --doctest-modules since we don't use it

---
 .github/workflows/python-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml
index be43e9f..3b4ccfe 100644
--- a/.github/workflows/python-tests.yml
+++ b/.github/workflows/python-tests.yml
@@ -29,7 +29,7 @@ jobs:
         pip install -r requirements_dev.txt
     - name: Run pytest
       run: |
-        pytest --doctest-modules --junitxml=junit/test-results.xml --cov=src --cov-report=xml --cov-report=html
+        pytest --junitxml=junit/test-results.xml --cov=src --cov-report=xml --cov-report=html
     - name: Publish Test Results
       uses: EnricoMi/publish-unit-test-result-action@v2
       if: always()

From 79731da6ae5a606b810a61904e9f484d055bded6 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 23:35:55 +0200
Subject: [PATCH 7/8] Added llm4decompile submodule

---
 .gitmodules              | 3 +++
 submodules/llm4decompile | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 submodules/llm4decompile

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..efcecde
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "submodules/llm4decompile"]
+	path = submodules/llm4decompile
+	url = https://github.com/albertan017/LLM4Decompile
diff --git a/submodules/llm4decompile b/submodules/llm4decompile
new file mode 160000
index 0000000..aa23b74
--- /dev/null
+++ b/submodules/llm4decompile
@@ -0,0 +1 @@
+Subproject commit aa23b74ed144f50944755f772cb8a186540edfc5

From bb9227529d63c28140126ec3d2148d621b489c02 Mon Sep 17 00:00:00 2001
From: Martin <1224973+mavaa@users.noreply.github.com>
Date: Fri, 24 May 2024 23:36:44 +0200
Subject: [PATCH 8/8] Script for running llm4decompile in a ROCm Docker
 container

---
 run_llm4decompile.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100755 run_llm4decompile.sh

diff --git a/run_llm4decompile.sh b/run_llm4decompile.sh
new file mode 100755
index 0000000..d9d35b9
--- /dev/null
+++ b/run_llm4decompile.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+source .venv/bin/activate
+source .env
+
+sudo docker run -it --network=host \
+    --device=/dev/kfd --device=/dev/dri \
+    --group-add=video --ipc=host \
+    --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+    --shm-size 8G \
+    -v ./:/src -w /src \
+    rocm/pytorch
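
Note: transform_cases_for_llm4decompile.py imports create_folder_if_not_exists
from src.util, which is not part of this series. A minimal sketch of what that
helper presumably looks like (the module path comes from the import; the body
is an assumption -- os.makedirs(path, exist_ok=True) would behave the same way):

    # src/util.py -- assumed helper, not included in these patches
    import os

    def create_folder_if_not_exists(path):
        # Create the folder (including parents) if it does not already exist
        if not os.path.exists(path):
            os.makedirs(path)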
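
The series also stops short of scoring the model output: run_llm4decompile.py
prints the decompiled function but never feeds it to calc_codebleu. A sketch of
the missing glue, reusing the settings from codebleu_example_cases.py (the
script name and both file paths are hypothetical placeholders for wherever the
original source and the saved model output actually live):

    # score_decompilation.py -- hypothetical glue script; paths are placeholders
    from codebleu import calc_codebleu

    def score(reference_path, prediction_path):
        with open(reference_path) as f:
            reference = f.read()
        with open(prediction_path) as f:
            prediction = f.read()
        # Same settings as codebleu_example_cases.py: C source, equal weights
        return calc_codebleu([reference], [prediction], lang="c",
                             weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

    if __name__ == '__main__':
        results = score("data_decompile_eval/sources/task_000.c",
                        "data_decompile_eval/decompiled/task_000.c")
        for key, value in results.items():
            print(f"{key}: {value:.2%}")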