update app.py, requirements.txt and readme; features and cleanup

David Westgate 2024-04-18 23:52:54 -07:00
parent 1002443082
commit 3fd7e3361f
5 changed files with 117 additions and 62 deletions

README.md

@@ -1,4 +1,40 @@
## HW1 for gensec
This is a RAG LLM application for asking questions about Kerbal Space Program (KSP). KSP and KSP 2 are very technical games relying on orbital physics, and players often want to quickly learn more about celestial body characteristics, common orbital maneuvers, delta-v requirements, and so on. The player-maintained [wiki](https://wiki.kerbalspaceprogram.com/) contains almost all of this information, so my application uses it as the primary source.
### Environment
Install python3, then run the following:
```
pip install -r requirements.txt
touch .env
```
Then populate .env with your OPENAI_API_KEY and FIRECRAWL_API_KEY.
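A minimal .env would look like this (placeholder values; the variable names assume the standard Firecrawl key name):
```
OPENAI_API_KEY=sk-your-key-here
FIRECRAWL_API_KEY=fc-your-key-here
```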
### Loading
This will take some time (about 2 minutes for ~75 pages), so be patient. Firecrawl does support querying the status of a long-running crawl job, so I will add status feedback later (see the sketch below).
```
python3 loader.py
```
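The planned status feedback would roughly look like the following sketch. This is an illustration, not part of the commit; it assumes firecrawl-py's crawl_url with wait_until_done=False returns a job id accepted by check_crawl_status, and it reuses the crawl_params defined in loader.py.
```
import os
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

# Start the crawl without blocking, then poll the job until it finishes
job = app.crawl_url("https://wiki.kerbalspaceprogram.com/",
                    params=crawl_params, wait_until_done=False)
while True:
    status = app.check_crawl_status(job["jobId"])
    print(f"crawl status: {status['status']}")
    if status["status"] == "completed":
        break
    time.sleep(10)
```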
### Testing
The loaded documents can be tested at this point with the search script
```
python3 search.py
```
### Running
```
python3 app.py
```
### Example Questions
```
llm>> How much delta-V is required to exit Kerbin?
llm>> How many moons does Jool have?
llm>> How large is Gilly's sphere of influence?
llm>> Describe Eve's physical characteristics
llm>> Does the game support multiplayer?
llm>> Which engines are good for deep space travel?
```
Enjoy!

app.py

@@ -1 +1,48 @@
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
"""
User-facing RAG application. Mostly adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/08_rag_query.py
Small changes made to use OpenAI embeddings and to load the environment from dotenv.
I use the same rag-prompt since it's a good choice.
"""
load_dotenv()

def format_docs(docs):
    # Join retrieved chunks into a single context string for the prompt
    return "\n\n".join(doc.page_content for doc in docs)

vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb"
)
prompt = hub.pull("rlm/rag-prompt")
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(model="gpt-4")

# Retrieve context for the question, fill the prompt, call the model, parse to text
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions about KSP.")
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    document_data_sources.add(doc_metadata['sourceURL'])
print(f"Loaded documents from {len(document_data_sources)} source pages")
while True:
    line = input("llm>> ")
    if line:
        result = rag_chain.invoke(line)
        print(result)
    else:
        break

loader.py

@@ -15,11 +15,12 @@ Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
This takes a while to crawl, so just run it once and watch out for Firecrawl credit usage.
"""
@@ -27,43 +28,35 @@ This takes a while to crawl, so just run it once and watch out for Firecrawl credit usage.
load_dotenv()
crawl_params = {
    'crawlerOptions': {
        # Exclude non-English paths, image resources, etc.
        'excludes': [
            'cs',
            'da',
            'de',
            'es',
            'fi',
            'fr',
            'he',
            'hr',
            'hu',
            'it',
            'ja',
            'ko',
            'nl',
            'no',
            'pl',
            'pt',
            'ru',
            'sv',
            'th',
            'tr',
            'zh-cn',
            '.jpg',
            '.png',
            '.gif'
        ],
        'includes': ['wiki/*'],
        'limit': 75,  # higher limit means more credits and more wait time
    }
}
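The loader call itself falls outside this hunk. Assuming it uses langchain_community's FireCrawlLoader, it plausibly resembles the sketch below; the URL and api_key handling are my assumptions, not part of the diff.
```
import os
from langchain_community.document_loaders import FireCrawlLoader

# Crawl the KSP wiki via Firecrawl and return LangChain documents
loader = FireCrawlLoader(
    api_key=os.getenv("FIRECRAWL_API_KEY"),  # assumes .env already loaded above
    url="https://wiki.kerbalspaceprogram.com/",
    mode="crawl",
    params=crawl_params,
)
docs = loader.load()
```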
@@ -74,7 +67,6 @@ print("docs loaded")
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")
# Embed
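The embedding step is also outside the hunk; presumably it persists the splits into the same Chroma directory that app.py and search.py read from. A sketch of what that likely looks like:
```
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed the chunks and persist them for app.py and search.py to query
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)
```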

requirements.txt

@@ -1,25 +1,7 @@
langchain-community
langchain
langchain_openai
python-dotenv
firecrawl-py
chromadb
langchainhub

search.py

@@ -1,13 +1,11 @@
from langchain_community.vectorstores import Chroma
import readline
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
""" """
A search utility for the loaded documents, for testing and debugging A search utility for the loaded documents, for testing and debugging
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py with small change for env loading and OpenAI embedding
""" """