From 266d5953cccae16071ebb272a420142649982d47 Mon Sep 17 00:00:00 2001
From: David Westgate <davidjwestgate@gmail.com>
Date: Tue, 30 Apr 2024 16:00:31 -0700
Subject: [PATCH] format all scripts; fix loader issue from hw1 firecrawl;
 start hw3

---
 hw1/app.py    | 16 +++++----
 hw1/loader.py | 94 +++++++++++++++++++++++++++++----------------------
 hw1/search.py | 13 ++++---
 hw2/app.py    | 27 +++++++++------
 hw2/tools.py  | 37 +++++++++++++-------
 hw3/notes.MD  | 17 ++++++++++
 6 files changed, 129 insertions(+), 75 deletions(-)
 create mode 100644 hw3/notes.MD

diff --git a/hw1/app.py b/hw1/app.py
index 15430b5..82ce56b 100644
--- a/hw1/app.py
+++ b/hw1/app.py
@@ -15,9 +15,11 @@ I use the same rag-prompt since it's a good choice
 
 load_dotenv()
 
+
 def format_docs(docs):
     return "\n\n".join(doc.page_content for doc in docs)
 
+
 def get_rag_chain():
     return (
         {"context": retriever | format_docs, "question": RunnablePassthrough()}
@@ -28,19 +30,20 @@ def get_rag_chain():
 
 
 vectorstore = Chroma(
-    embedding_function=OpenAIEmbeddings(),
-    persist_directory="./rag_data/.chromadb"
+    embedding_function=OpenAIEmbeddings(), persist_directory="./rag_data/.chromadb"
 )
 prompt = hub.pull("rlm/rag-prompt")
 retriever = vectorstore.as_retriever()
 llm = ChatOpenAI(model="gpt-4")
 
 document_data_sources = set()
-for doc_metadata in retriever.vectorstore.get()['metadatas']:
-    document_data_sources.add(doc_metadata['sourceURL']) 
+for doc_metadata in retriever.vectorstore.get()["metadatas"]:
+    document_data_sources.add(doc_metadata["sourceURL"])
 
-if __name__ == "__main__" :
-    print("Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions ")
+if __name__ == "__main__":
+    print(
+        "Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions "
+    )
     while True:
         line = input("llm>> ")
         if line:
@@ -48,4 +51,3 @@ if __name__ == "__main__" :
             print(result)
         else:
             break
-
diff --git a/hw1/loader.py b/hw1/loader.py
index 2701ec0..3e57e47 100644
--- a/hw1/loader.py
+++ b/hw1/loader.py
@@ -1,7 +1,9 @@
+from typing import List
 from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_openai import OpenAIEmbeddings
 from langchain_community.document_loaders import FireCrawlLoader
+from langchain_core.documents import Document
 from dotenv import load_dotenv
 
 """
@@ -26,59 +28,69 @@ This takes a while to crawl, so just run it once and watch out for firecrawl cre
 """
 
 load_dotenv()
+page_options = {"onlyMainContent": True}
 crawl_params = {
-    'crawlerOptions': {
-        #Exclude non-english paths, image resources, etc.
-        'excludes': [
-                'cs',
-                'da',
-                'de',
-                'es',
-                'fi',
-                'fr',
-                'he',
-                'hr',
-                'hu',
-                'it',
-                'ja',
-                'ko',
-                'nl',
-                'no',
-                'pl',
-                'pt',
-                'ru',
-                'sv',
-                'th',
-                'tr',
-                'zh-cn'
-                '.jpg',
-                '.png'
-                '.gif'
+    "crawlerOptions": {
+        # Exclude non-english paths, image resources, etc.
+        "excludes": [
+            "cs",
+            "da",
+            "de",
+            "es",
+            "fi",
+            "fr",
+            "he",
+            "hr",
+            "hu",
+            "it",
+            "ja",
+            "ko",
+            "nl",
+            "no",
+            "pl",
+            "pt",
+            "ru",
+            "sv",
+            "th",
+            "tr",
+            "zh-cn",
+            ".jpg",
+            ".png",
+            ".gif",
         ],
-        'includes': ['wiki/*'],
-        'limit': 75, #higher limit means more credits and more wait time. 
-    }
+        "includes": ["wiki/*"],
+        "limit": 75,  # higher limit means more credits and more wait time.
+    },
+    "pageOptions": {"onlyMainContent": True},
 }
 
-loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page",  mode="crawl", params=crawl_params)
-docs = loader.load()
-print("docs loaded")
+loader = FireCrawlLoader(
+    "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
+    mode="crawl",
+    params=crawl_params,
+)
+docs: List[Document] = loader.load()
 
 # Split
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-splits = text_splitter.split_documents(docs)
-print("split complete")
+splits: List[Document] = text_splitter.split_documents(docs)
+
+# This metadata incompatibility issue should be resolved by the firecrawl maintainer (ogLocaleAlternate is an empty list, not allowed by Chroma)
+for doc in splits:
+    doc.metadata.pop("ogLocaleAlternate", None)
 
 # Embed
-vectorstore = Chroma.from_documents(documents=splits, 
-                                    embedding=OpenAIEmbeddings(),
-                                    persist_directory="./rag_data/.chromadb")
+vectorstore = Chroma.from_documents(
+    documents=splits,
+    embedding=OpenAIEmbeddings(),
+    persist_directory="./rag_data/.chromadb",
+)
 
 
 print("RAG database initialized with the following sources.")
 retriever = vectorstore.as_retriever()
 document_data_sources = set()
-for doc_metadata in retriever.vectorstore.get()['metadatas']:
-    document_data_sources.add(doc_metadata['sourceURL']) 
+for doc_metadata in retriever.vectorstore.get()["metadatas"]:
+    document_data_sources.add(doc_metadata["sourceURL"])
 for doc in document_data_sources:
-    print(f"  {doc}")
\ No newline at end of file
+    print(f"  {doc}")
diff --git a/hw1/search.py b/hw1/search.py
index a75914d..d5032c8 100644
--- a/hw1/search.py
+++ b/hw1/search.py
@@ -11,10 +11,10 @@ Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearc
 
 load_dotenv()
 vectorstore = Chroma(
-    embedding_function=OpenAIEmbeddings(),
-    persist_directory="./rag_data/.chromadb"
+    embedding_function=OpenAIEmbeddings(), persist_directory="./rag_data/.chromadb"
 )
 
+
 def search_db(query):
     docs = vectorstore.similarity_search(query)
     print(f"Query database for: {query}")
@@ -23,16 +23,19 @@ def search_db(query):
     else:
         print("No matching documents")
 
+
 print("RAG database initialized.")
 retriever = vectorstore.as_retriever()
 document_data_sources = set()
-for doc_metadata in retriever.vectorstore.get()['metadatas']:
+for doc_metadata in retriever.vectorstore.get()["metadatas"]:
     print(f"docm {doc_metadata}")
-    document_data_sources.add(doc_metadata['sourceURL']) 
+    document_data_sources.add(doc_metadata["sourceURL"])
 for doc in document_data_sources:
     print(f"  {doc}")
 
-print("This program queries documents in the RAG database that are similar to whatever is entered.")
+print(
+    "This program queries documents in the RAG database that are similar to whatever is entered."
+)
 while True:
     line = input(">> ")
     if line:
diff --git a/hw2/app.py b/hw2/app.py
index 0e981a3..79f036a 100644
--- a/hw2/app.py
+++ b/hw2/app.py
@@ -8,7 +8,8 @@ from langchain_community.utilities.google_jobs import GoogleJobsAPIWrapper
 
 from dotenv import load_dotenv
 from tools import lookup_ip, lookup_name, search_ksp
-#from langsmith import Client
+
+# from langsmith import Client
 
 """
 This is the main runner of the custom agent. Custom agent tools are defined seperatly and imported from tools.py
@@ -19,27 +20,33 @@ Langsmith code can be uncommeted for testing/debugging
 """
 
 load_dotenv()
-#os.environ["LANGCHAIN_TRACING_V2"] = "true"
-#os.environ["LANGCHAIN_PROJECT"] = f"LangSmith Introduction"
-#os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
-#client = Client()
+# os.environ["LANGCHAIN_TRACING_V2"] = "true"
+# os.environ["LANGCHAIN_PROJECT"] = f"LangSmith Introduction"
+# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
+# client = Client()
 llm = ChatOpenAI(model_name="gpt-4-turbo", temperature=0)
 
-tools = load_tools(["serpapi", "terminal", "dalle-image-generator", "google-jobs"], allow_dangerous_tools=True, llm=llm)
+tools = load_tools(
+    ["serpapi", "terminal", "dalle-image-generator", "google-jobs"],
+    allow_dangerous_tools=True,
+    llm=llm,
+)
 tools.extend([lookup_name, lookup_ip, search_ksp])
 
 base_prompt = hub.pull("langchain-ai/react-agent-template")
-prompt = base_prompt.partial(instructions="Answer the user's request utilizing at most 8 tool calls")
-agent = create_react_agent(llm,tools,prompt)
+prompt = base_prompt.partial(
+    instructions="Answer the user's request utilizing at most 8 tool calls"
+)
+agent = create_react_agent(llm, tools, prompt)
 agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
 print("Welcome to my application. I am configured with these tools:")
 for tool in agent_executor.tools:
-  print(f'  Tool: {tool.name} = {tool.description}')
+    print(f"  Tool: {tool.name} = {tool.description}")
 while True:
     line = input("llm>> ")
     try:
         if line:
-            result = agent_executor.invoke({"input":line})
+            result = agent_executor.invoke({"input": line})
             print(result)
         else:
             break
diff --git a/hw2/tools.py b/hw2/tools.py
index 6269e67..69dc754 100644
--- a/hw2/tools.py
+++ b/hw2/tools.py
@@ -4,7 +4,8 @@ import dns.resolver, dns.reversename
 import validators
 import sys
 import os
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from hw1.app import get_rag_chain
 
 """
@@ -14,34 +15,46 @@ These are the same tooling provided by the example in https://github.com/wu4f/cs
 with the addition of my Kerbal Space Program RAG Application tool
 """
 
+
 class LookupNameInput(BaseModel):
     hostname: str = Field(description="Should be a hostname such as www.google.com")
+
     @root_validator
-    def is_dns_address(cls, values: dict[str,any]) -> str:
+    def is_dns_address(cls, values: dict[str, any]) -> str:
         if validators.domain(values.get("hostname")):
             return values
         raise ValueError("Malformed hostname")
+
+
 class LookupIPInput(BaseModel):
-    address: str = Field(description="Should be an IP address such as 208.91.197.27 or 143.95.239.83")
+    address: str = Field(
+        description="Should be an IP address such as 208.91.197.27 or 143.95.239.83"
+    )
+
     @root_validator
-    def is_ip_address(cls, values: dict[str,any]) -> str:
+    def is_ip_address(cls, values: dict[str, any]) -> str:
         if validators.ip_address.ipv4(values.get("address")):
             return values
         raise ValueError("Malformed IP address")
-    
+
+
 class KSPTool(BaseModel):
-    query: str = Field(description="should be a kerbal space program (ksp) related query")
+    query: str = Field(
+        description="should be a kerbal space program (ksp) related query"
+    )
+
 
 @tool("kerbal_space_program_ksp_information", args_schema=KSPTool, return_direct=False)
-def search_ksp(query:str) -> str:
+def search_ksp(query: str) -> str:
     """Given a query about kerbal space program (ksp), it will send the query to the KSP rag applciation"""
     return get_rag_chain().invoke(query)
 
-@tool("lookup_name",args_schema=LookupNameInput, return_direct=False)
+
+@tool("lookup_name", args_schema=LookupNameInput, return_direct=False)
 def lookup_name(hostname):
     """Given a DNS hostname, it will return its IPv4 addresses"""
-    result = dns.resolver.resolve(hostname, 'A')
-    res = [ r.to_text() for r in result ]
+    result = dns.resolver.resolve(hostname, "A")
+    res = [r.to_text() for r in result]
     return res[0]
 
 
@@ -49,6 +62,6 @@ def lookup_name(hostname):
 def lookup_ip(address):
     """Given an IP address, returns names associated with it"""
     n = dns.reversename.from_address(address)
-    result = dns.resolver.resolve(n, 'PTR')
-    res = [ r.to_text() for r in result ]
+    result = dns.resolver.resolve(n, "PTR")
+    res = [r.to_text() for r in result]
     return res[0]
diff --git a/hw3/notes.MD b/hw3/notes.MD
new file mode 100644
index 0000000..939687a
--- /dev/null
+++ b/hw3/notes.MD
@@ -0,0 +1,17 @@
+# Security testing
+
+## LangChain RAG application (hw1)
+### Indirect prompt injection
+todo
+### Insecure output handling
+todo
+### Data poisoning
+todo
+
+## LangChain agent (hw2)
+### Excessive agency
+todo
+### Insecure tool design
+todo
+### Sensitive information exposure
+todo