update readme, format app and comment app, update reqs

2024-05-16 14:42:51 -07:00 · 2024-05-16 14:42:51 -07:00 · c4b0cc744d
commit c4b0cc744d
parent 14682a55b5
4 changed files with 98 additions and 66 deletions
--- a/.gitignore
+++ b/.gitignore
@ -6,3 +6,4 @@ __pycache__/
 rag_data
 .chromadb
 *temp*
 *downloads*
--- a/hw5/README.md
+++ b/hw5/README.md
@ -1,2 +1,32 @@
 ###### David Westgate 17 May 2024
-## HW4 for gensec
+## HW5 for gensec
 This application attempts to automatically solve CTF levels for CS492/CS592 Malware Reverse Engineering. 
 It does so by prompting the user for a specific binary level, and automatically fetching the level from the web.
 If it is able to do this, the application will perform an object dump of this binary, and send it to the LLM with a prompt, 
 to see if the LLM can respond with the correct password. 
 ### Setup + Run
 Install python3, then
 ```
 cd hw5
 pip install -r requirnments.txt
 cp .env.example .env #fill in env file with key
 python3 app.py
 ```
 ### Results
 On average, this application is not very good at solving the CTF levels. This is not very suprising however, as these malware reverse engineering levels are technically difficult, and often require special tooling, debugging and subversion of anti-dissassembly and anti-debugging techniques. 
 That said, I tested this application with a handful of the binary files, focusing on the earlier/easier levels of the various chapters (avoiding excessive API cost prevents me for testing all levels). A few succeeded, with others failed returning the incorrect password, or failed and acknowledged they could not figure out it due to limitations.
 #### Success
 * Ch01StatA_Readelf
 * Ch08Dbg_GdbIntro
 #### Failed (wrong answer)
 * Ch15AntiDis_FakeCallInt
 * Ch21x64_ParamsStack
 #### Failed (acknowledged)
 * Ch15AntiDis_FakeCond
 * Ch18PackUnp_UnpackEasy
 I imagine this program may perform better for CTF levels of other classes like CS205 Computer Systems Programming
--- a/hw5/app.py
+++ b/hw5/app.py
@ -1,46 +1,31 @@
-from typing import Iterable, Literal
+import subprocess
 from langchain_community.document_loaders.generic import GenericLoader
 from langchain_community.document_loaders.parsers import LanguageParser
 from langchain_openai import ChatOpenAI
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_community.document_loaders import FileSystemBlobLoader
 from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader
 from langchain_community.document_loaders import AsyncHtmlLoader
 from langchain.chains import LLMChain
 from langchain.chains import SimpleSequentialChain
 from dotenv import load_dotenv
 import requests
 import tempfile
 import zipfile
 import io
 import os
-from validators import url
+from langchain_openai import ChatOpenAI
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from dotenv import load_dotenv
 from time import time
 load_dotenv()
 """
 This application attempts to automatically solve CTF levels for CS492/CS592 Malware Reverse Engineering. 
 It does so by prompting the user for a specific binary level, and automatically fetching the level from the web.
 If it is able to do this, the application will perform an object dump of this binary, and send it to the LLM with a prompt, 
 to see if the LLM can respond with the correct password (copied from README.md). 
 """
-
+load_dotenv()
 # def get_rag_chain():
 #     return (
 #         {"context": retriever | format_docs, "question": RunnablePassthrough()}
 #         | prompt
 #         | llm
 #         | StrOutputParser()
 #     )
 session = requests.Session()
 url = "https://cs492.oregonctf.org/"
-
+"""
 Binary levels are group into various zip archives on the origin. This helps us choose the right zip to download.
 """
 def get_group(chapter: int):
    if chapter >= 1 and chapter <= 8:
        return "Ch01-08"
@ -54,6 +39,9 @@ def get_group(chapter: int):
        return False
 """
 The downloads for the binaries require an authenticated session. Here, we initiate that session with default provided credentials
 """
 def start_session():
    payload = {"username": "demo0", "passwd": "malware"}
@ -64,6 +52,10 @@ def start_session():
    session.post(url, data=payload, headers=headers)
 """
 Once we have the level name and group, we download the zip file of the appropriate group into a scope based temporary directory. 
 We scan those files, to find the one with the name matching the chosen level. If it is found, we copy that into a persistant downloads directory, for later
 """
 def get_file_path(level, group):
    payload = {
        "setname": group,
@ -78,65 +70,74 @@ def get_file_path(level, group):
            with zipfile.ZipFile(zip_file) as z:
                z.extractall(temp_dir)
                extracted_files = os.listdir(temp_dir)
                # print(f"Extracted files: {extracted_files}")
                for file_name in extracted_files:
                    if file_name == level:
                        temp_file_path = os.path.join(temp_dir, file_name)
-                        saved_file_path = os.path.join("./temp", file_name)
+                        saved_file_path = os.path.join("./downloads", file_name)
                        os.rename(temp_file_path, saved_file_path)
                        return saved_file_path
    return False
 """
 Object dump the binary file, and send it in a chain with our prompt to the LLM
 """
 def execute(path: str):
    print("path ", path)
    loader = FileSystemBlobLoader(path, show_progress=True)
    prompt = PromptTemplate.from_template(
-        "Describe the attached binary file: {file_content}"
+        """
-    )
+                                          Below is an object dump for a GNU/Linux ELF 32-bit LSB executable Intel 80386 binary file recovered using objdump -s. 
-    llm = ChatOpenAI(model_name="gpt-4-turbo", temperature=0)
+                                          When run, this binary is a 'capture the flag' exercise, which tells the user some hints, then prompts "Enter the password:"  : 
-    blobs: Iterable[Blob] = loader.yield_blobs()
+                                          You are an expert reverse engineer. Reverse engineer the password by following the instruction flow of the program, subverting any tricks that may make it otherwise difficult to recover the password.
                                          As necessary, utilize the hints given in the prompt of the program. 
                                          If you are able to recover the password reply with only the password and nothing else.
                                          If you are not able to recover the password, explain in detail why.    
-    for b in blobs:
+                                          ***Object Dump Begin***
-        chain = SimpleSequentialChain(
+
-            prompt=prompt,
+                                          {objdump}
-            llm=llm,
+                                          """
    )
-        # print('b ',b)
+    llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
-        result = chain.run(file_content=b)
+    chain = {"objdump": RunnablePassthrough()} | prompt | llm | StrOutputParser()
    objdump = subprocess.run(
        ["objdump", "-s", path], capture_output=True, text=True, check=True
    )
    result = chain.invoke(objdump.stdout)
    return result
 print(
-    "This program will attempt to automatically solve CS 492/592 Malware Reverse Engineering CTF levels."
+    "\n\nThis program will attempt to automatically solve CS 492/592 Malware Reverse Engineering CTF levels.\nEnter the name of the level (example: Ch01StatA_Readelf)"
 )
 print("Enter the name of the level (example: Ch03DynA_Ltrace)")
 while True:
    try:
-        level: str = input("name of binary>> ")
+        level: str = input("\nname of binary>> ")
        if level:
            chapter = level[2:4]
-            if chapter.strip().isdigit():
+            if chapter.strip().isdigit(): # parse binary for the chapter number
-                group = get_group(int(chapter))
+                group = get_group(int(chapter)) # get chapter group from chapter number
                if group:
                    start_time = time()
-                    start_session()
+                    start_session()         #start session (get session token)
-                    if os.path.isfile(os.path.join("./temp", level)):
+                    if os.path.isfile(os.path.join("./downloads", level)): #If we already have this file, no need to fetch it again
-                        print("found")
+                        print("File already found")
-                        file_path = os.path.join("./temp", level)
+                        file_path = os.path.join("./downloads", level)
                    else:
                        print("Fetching file...")
                        file_path = get_file_path(level, group)
                    if file_path:
-                        results = execute(file_path)
+                        result = execute(file_path)
-                        print(results)
+                        end_time = time()
                        elapsed_time = round(end_time - start_time, 2)
                        print(
                            "\n", result, "\n\nElapsed time: ", elapsed_time, " seconds"
                        )
                    else:
                        raise Exception("Not a valid file")
                else:
                    raise Exception("Not a valid chapter of a possible file")
            else:
-                raise Exception("Bad input format (example: Ch03DynA_Ltrace)")
+                raise Exception("Bad input format (example: Ch01StatA_Readelf)")
        else:
            break
    except Exception as e:
--- a/hw5/requirements.txt
+++ b/hw5/requirements.txt
@ -1,5 +1,5 @@
 langchain_core
 langchain-community
 langchain_openai
 python-dotenv
-validators
+Requests
 esprima