update app.py, requirements.txt and readme; features and cleanup

David Westgate 2024-04-18 23:52:54 -07:00
parent 1002443082
commit 3fd7e3361f
5 changed files with 117 additions and 62 deletions

View File

@@ -1,4 +1,40 @@
## HW1 for gensec
This is a RAG LLM application for asking questions about Kerbal Space Program (KSP). KSP and KSP 2 are very technical games built around orbital physics, and players often want to quickly look up celestial body characteristics, common orbital maneuvers, delta-v requirements, and so on. The player-maintained [wiki](https://wiki.kerbalspaceprogram.com/) contains almost all of this information, so my application uses it as its primary source.
### Environment
Install python3, then run the following:
```
pip install -r requirements.txt
touch .env
```
Then populate .env with your OPENAI_API_KEY and FIREWALL_API_KEY.
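For reference, the resulting .env should look something like this (placeholder values):
```
OPENAI_API_KEY=your-openai-key-here
FIREWALL_API_KEY=your-firecrawl-key-here
```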
### Loading
This will take some time (about 2 minutes at the configured limit of ~75 pages), so be patient. Firecrawl does support querying the status of a long-running crawl job, so I will add status feedback later; a rough sketch of what that could look like follows the command below.
```
python3 loader.py
```
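A rough sketch of that status feedback, assuming firecrawl-py's `FirecrawlApp` exposes `crawl_url(..., wait_until_done=False)` and `check_crawl_status()`; the method names and returned fields here are assumptions to verify against the firecrawl-py docs, and this is not yet part of loader.py:
```
# Sketch only: poll Firecrawl for crawl progress instead of blocking.
# Assumes crawl_url(..., wait_until_done=False) returns a job id and
# check_crawl_status() reports progress -- verify against firecrawl-py docs.
import os
import time
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()
app = FirecrawlApp(api_key=os.getenv("FIREWALL_API_KEY"))

# Kick off the crawl without waiting, then poll its status.
job = app.crawl_url(
    "https://wiki.kerbalspaceprogram.com",
    params={"crawlerOptions": {"includes": ["wiki/*"], "limit": 75}},
    wait_until_done=False,
)
while True:
    status = app.check_crawl_status(job["jobId"])
    print(f"crawl status: {status.get('status')}")
    if status.get("status") in ("completed", "failed"):
        break
    time.sleep(15)
```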
### Testing
The loaded documents can be tested at this point with the search script
```
python3 search.py
```
### Running
```
python3 app.py
```
### Example Questions
```
llm>> How much delta-V is required to exit Kerbin?
llm>> How many moons does Jool have?
llm>> How large is Gilly's sphere of influence?
llm>> Describe Eve's physical characteristics
llm>> Does the game support multiplayer?
llm>> Which engines are good for deep space travel?
```
Enjoy!

View File

@@ -1 +1,48 @@
#todo
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
"""
User-facing RAG application. Mostly adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/08_rag_query.py
Small changes made to use OpenAI embeddings and to load environment variables with dotenv.
I use the same rag-prompt since it's a good choice
"""
load_dotenv()
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
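# Reuse the Chroma store persisted under ./rag_data/.chromadb (built by loader.py) with the same OpenAI embeddings.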
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb"
)
prompt = hub.pull("rlm/rag-prompt")
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(model="gpt-4")
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
print("Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions ")
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
document_data_sources.add(doc_metadata['sourceURL'])
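# Simple REPL: answer questions until the user enters an empty line.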
while True:
    line = input("llm>> ")
    if line:
        result = rag_chain.invoke(line)
        print(result)
    else:
        break

View File

@@ -15,11 +15,12 @@ Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time)
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
"""
@@ -27,43 +28,35 @@ This takes a while to crawl, so just run it once and watch out for firecrawl cre
load_dotenv()
crawl_params = {
'crawlerOptions': {
#I want to exclude non-english paths, but this isn't working right yet. Needs work
#Exclude non-english paths, image resources, etc.
'excludes': [
'*/cs',
'*/da',
'*/de',
'*/es',
'*/fi',
'*/fr',
'*/he',
'*/hr',
'*/hu',
'*/it',
'*/ja',
'*/ko',
'*/nl',
'*/no',
'*/pl',
'*/pt',
'*/ru',
'*/sv',
'*/th',
'*/tr',
'*/zh-cn'
'cs',
'da',
'de',
'es',
'fi',
'fr',
'he',
'hr',
'hu',
'it',
'ja',
'ko',
'nl',
'no',
'pl',
'pt',
'ru',
'sv',
'th',
'tr',
'zh-cn',
'.jpg',
'.png',
'.gif'
],
'includes': ['wiki/*'],
'limit': 20, #higher limit means more credits and more wait time.
'limit': 75, #higher limit means more credits and more wait time.
}
}
@@ -74,7 +67,6 @@ print("docs loaded")
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")
# Embed
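The loader invocation itself falls outside the hunks shown above. As a rough sketch of how `crawl_params` could feed a Firecrawl-backed LangChain loader (the use of `FireCrawlLoader` and the exact arguments here are assumptions, not code from this commit):
```
# Hypothetical sketch, not part of this commit: pass crawl_params to a
# Firecrawl-backed LangChain document loader and load the wiki pages.
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import FireCrawlLoader

load_dotenv()
loader = FireCrawlLoader(
    url="https://wiki.kerbalspaceprogram.com",
    api_key=os.getenv("FIREWALL_API_KEY"),  # key name taken from the README above
    mode="crawl",
    params=crawl_params,  # the options dict defined above in loader.py
)
docs = loader.load()
print("docs loaded")
```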

View File

@@ -1,25 +1,7 @@
langchain_google_genai
langchain-community
grpcio
langchain
huggingface_hub
bs4
requests
langchain_openai
python-dotenv
langchain-experimental
langchainhub
firecrawl-py
chromadb
pypdf
docx2txt
markdown
tiktoken
nltk
argparse
arxiv
pymupdf
wikipedia
asyncio
scikit-learn
unstructured
langchainhub

View File

@@ -1,13 +1,11 @@
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import readline
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
"""
A search utility for the loaded documents, for testing and debugging
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding
Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py with small change for env loading and OpenAI embedding
"""