diff --git a/.gitignore b/.gitignore index 2e8e497..ad19a0e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ __pycache__/ *tokens* rag_data .chromadb -*temp* \ No newline at end of file +*temp* +*downloads* \ No newline at end of file diff --git a/hw5/README.md b/hw5/README.md index b4c8934..c169df9 100644 --- a/hw5/README.md +++ b/hw5/README.md @@ -1,2 +1,32 @@ ###### David Westgate 17 May 2024 -## HW4 for gensec \ No newline at end of file +## HW5 for gensec +This application attempts to automatically solve CTF levels for CS492/CS592 Malware Reverse Engineering. +It does so by prompting the user for a specific binary level, and automatically fetching the level from the web. +If it is able to do this, the application will perform an object dump of this binary, and send it to the LLM with a prompt, +to see if the LLM can respond with the correct password. + +### Setup + Run +Install python3, then +``` +cd hw5 +pip install -r requirements.txt +cp .env.example .env #fill in env file with key +python3 app.py +``` + +### Results +On average, this application is not very good at solving the CTF levels. This is not very surprising, however, as these malware reverse engineering levels are technically difficult, and often require special tooling, debugging and subversion of anti-disassembly and anti-debugging techniques. + +That said, I tested this application with a handful of the binary files, focusing on the earlier/easier levels of the various chapters (avoiding excessive API cost prevents me from testing all levels). A few succeeded, while others either failed by returning an incorrect password, or failed and acknowledged they could not figure it out due to limitations. 
+ +#### Success +* Ch01StatA_Readelf +* Ch08Dbg_GdbIntro +#### Failed (wrong answer) +* Ch15AntiDis_FakeCallInt +* Ch21x64_ParamsStack +#### Failed (acknowledged) +* Ch15AntiDis_FakeCond +* Ch18PackUnp_UnpackEasy + +I imagine this program may perform better for CTF levels of other classes like CS205 Computer Systems Programming \ No newline at end of file diff --git a/hw5/app.py b/hw5/app.py index fdf579e..f892ede 100644 --- a/hw5/app.py +++ b/hw5/app.py @@ -1,46 +1,31 @@ -from typing import Iterable, Literal -from langchain_community.document_loaders.generic import GenericLoader -from langchain_community.document_loaders.parsers import LanguageParser -from langchain_openai import ChatOpenAI -from langchain_core.runnables import RunnablePassthrough -from langchain_core.prompts import PromptTemplate -from langchain_core.output_parsers import StrOutputParser -from langchain_community.document_loaders import FileSystemBlobLoader -from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader -from langchain_community.document_loaders import AsyncHtmlLoader -from langchain.chains import LLMChain -from langchain.chains import SimpleSequentialChain - -from dotenv import load_dotenv +import subprocess import requests import tempfile import zipfile import io import os -from validators import url +from langchain_openai import ChatOpenAI +from langchain_core.runnables import RunnablePassthrough +from langchain_core.prompts import PromptTemplate +from langchain_core.output_parsers import StrOutputParser +from dotenv import load_dotenv from time import time + +""" +This application attempts to automatically solve CTF levels for CS492/CS592 Malware Reverse Engineering. +It does so by prompting the user for a specific binary level, and automatically fetching the level from the web. 
+If it is able to do this, the application will perform an object dump of this binary, and send it to the LLM with a prompt, +to see if the LLM can respond with the correct password (copied from README.md). +""" + load_dotenv() - -""" -This application attempts to automatically solve CTF levels for CS492/CS592 Malware Reverse Engineering. -""" - - -# def get_rag_chain(): -# return ( -# {"context": retriever | format_docs, "question": RunnablePassthrough()} -# | prompt -# | llm -# | StrOutputParser() -# ) - - session = requests.Session() - url = "https://cs492.oregonctf.org/" - +""" +Binary levels are grouped into various zip archives on the origin. This helps us choose the right zip to download. +""" def get_group(chapter: int): if chapter >= 1 and chapter <= 8: return "Ch01-08" @@ -54,6 +39,9 @@ def get_group(chapter: int): return False +""" +The downloads for the binaries require an authenticated session. Here, we initiate that session with the default provided credentials. +""" def start_session(): payload = {"username": "demo0", "passwd": "malware"} @@ -64,6 +52,10 @@ def start_session(): session.post(url, data=payload, headers=headers) +""" +Once we have the level name and group, we download the zip file of the appropriate group into a scope-based temporary directory. +We scan those files to find the one with the name matching the chosen level. 
If it is found, we move it into a persistent downloads directory for later use. +""" def get_file_path(level, group): payload = { "setname": group, @@ -78,65 +70,74 @@ def get_file_path(level, group): with zipfile.ZipFile(zip_file) as z: z.extractall(temp_dir) extracted_files = os.listdir(temp_dir) - # print(f"Extracted files: {extracted_files}") for file_name in extracted_files: if file_name == level: temp_file_path = os.path.join(temp_dir, file_name) - saved_file_path = os.path.join("./temp", file_name) + saved_file_path = os.path.join("./downloads", file_name) os.rename(temp_file_path, saved_file_path) return saved_file_path return False +""" +Object dump the binary file, and send it in a chain with our prompt to the LLM +""" def execute(path: str): - print("path ", path) - loader = FileSystemBlobLoader(path, show_progress=True) prompt = PromptTemplate.from_template( - "Describe the attached binary file: {file_content}" - ) - llm = ChatOpenAI(model_name="gpt-4-turbo", temperature=0) - blobs: Iterable[Blob] = loader.yield_blobs() + """ + Below is an object dump for a GNU/Linux ELF 32-bit LSB executable Intel 80386 binary file recovered using objdump -s. + When run, this binary is a 'capture the flag' exercise, which tells the user some hints, then prompts "Enter the password:" : + You are an expert reverse engineer. Reverse engineer the password by following the instruction flow of the program, subverting any tricks that may make it otherwise difficult to recover the password. + As necessary, utilize the hints given in the prompt of the program. + If you are able to recover the password reply with only the password and nothing else. + If you are not able to recover the password, explain in detail why. 
+ + ***Object Dump Begin*** - for b in blobs: - chain = SimpleSequentialChain( - prompt=prompt, - llm=llm, - ) - # print('b ',b) - result = chain.run(file_content=b) - return result + {objdump} + """ + ) + llm = ChatOpenAI(model_name="gpt-4o", temperature=0) + chain = {"objdump": RunnablePassthrough()} | prompt | llm | StrOutputParser() + objdump = subprocess.run( + ["objdump", "-s", path], capture_output=True, text=True, check=True + ) + result = chain.invoke(objdump.stdout) + return result print( - "This program will attempt to automatically solve CS 492/592 Malware Reverse Engineering CTF levels." + "\n\nThis program will attempt to automatically solve CS 492/592 Malware Reverse Engineering CTF levels.\nEnter the name of the level (example: Ch01StatA_Readelf)" ) -print("Enter the name of the level (example: Ch03DynA_Ltrace)") - - while True: try: - level: str = input("name of binary>> ") + level: str = input("\nname of binary>> ") if level: chapter = level[2:4] - if chapter.strip().isdigit(): - group = get_group(int(chapter)) + if chapter.strip().isdigit(): # parse binary for the chapter number + group = get_group(int(chapter)) # get chapter group from chapter number if group: start_time = time() - start_session() - if os.path.isfile(os.path.join("./temp", level)): - print("found") - file_path = os.path.join("./temp", level) + start_session() #start session (get session token) + if os.path.isfile(os.path.join("./downloads", level)): #If we already have this file, no need to fetch it again + print("File already found") + file_path = os.path.join("./downloads", level) else: + print("Fetching file...") file_path = get_file_path(level, group) if file_path: - results = execute(file_path) - print(results) + result = execute(file_path) + end_time = time() + elapsed_time = round(end_time - start_time, 2) + print( + "\n", result, "\n\nElapsed time: ", elapsed_time, " seconds" + ) else: raise Exception("Not a valid file") else: raise Exception("Not a valid chapter of a 
possible file") else: - raise Exception("Bad input format (example: Ch03DynA_Ltrace)") + raise Exception("Bad input format (example: Ch01StatA_Readelf)") else: break except Exception as e: diff --git a/hw5/requirnments.txt b/hw5/requirements.txt similarity index 68% rename from hw5/requirnments.txt rename to hw5/requirements.txt index 045d427..56761d0 100644 --- a/hw5/requirnments.txt +++ b/hw5/requirements.txt @@ -1,5 +1,5 @@ +langchain_core langchain-community langchain_openai python-dotenv -validators -esprima \ No newline at end of file +Requests