diff --git a/hw1/README.md b/hw1/README.md index e5f60e6..44c0916 100644 --- a/hw1/README.md +++ b/hw1/README.md @@ -1,4 +1,40 @@ ## HW1 for gensec +This is a RAG LLM application for asking questions about Kerbal Space Program (KSP). KSP and KSP 2 are very technical games relying on orbital physics, and players often want to quickly learn more about celestial body characteristics, common orbital maneuvers, delta-v requirements, and so on. The player-maintained [wiki](https://wiki.kerbalspaceprogram.com/) contains almost all of this information, so my application uses it as a primary source. ### Enviornment -pip install -r requirements.txt \ No newline at end of file +Install python3, then run the following: +``` +pip install -r requirements.txt +touch .env +``` +Afterwards, populate .env with your OPENAI_API_KEY and FIREWALL_API_KEY. + +### Loading +This will take some time (about 2 minutes for ~75 pages), so be patient. Firecrawl does have support to query for the status of a long-running API call, so I will add status feedback later. +``` +python3 loader.py +``` + +### Testing +The loaded documents can be tested at this point with the search script: +``` +python3 search.py +``` + +### Running +``` +python3 app.py +``` + +### Example Questions +``` +llm>> How much delta-V is required to exit Kerbin? +llm>> How many moons does Jool have? +llm>> How large is Gilly's sphere of influence? +llm>> Describe Eve's physical characteristics +llm>> Does the game support multiplayer? +llm>> Which engines are good for deep space travel? + +``` + +Enjoy! 
\ No newline at end of file diff --git a/hw1/app.py b/hw1/app.py index 8b68f79..4dca458 100644 --- a/hw1/app.py +++ b/hw1/app.py @@ -1 +1,48 @@ -#todo \ No newline at end of file +from langchain import hub +from langchain_community.vectorstores import Chroma +from langchain_core.output_parsers import StrOutputParser +from langchain_core.runnables import RunnablePassthrough +from langchain_openai import ChatOpenAI, OpenAIEmbeddings +from dotenv import load_dotenv + +""" +User facing RAG application. Mostly adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/08_rag_query.py +Small changes made regarding OpenAI Embedding, and loading env from dotenv. + +I use the same rag-prompt since it's a good choice + +""" + +load_dotenv() + +def format_docs(docs): + return "\n\n".join(doc.page_content for doc in docs) + +vectorstore = Chroma( + embedding_function=OpenAIEmbeddings(), + persist_directory="./rag_data/.chromadb" +) +prompt = hub.pull("rlm/rag-prompt") +retriever = vectorstore.as_retriever() +llm = ChatOpenAI(model="gpt-4") + +rag_chain = ( + {"context": retriever | format_docs, "question": RunnablePassthrough()} + | prompt + | llm + | StrOutputParser() +) + +print("Welcome to the Kerbal Space Program RAG application. 
I will try to assist you with any questions ") +document_data_sources = set() +for doc_metadata in retriever.vectorstore.get()['metadatas']: + document_data_sources.add(doc_metadata['sourceURL']) + +while True: + line = input("llm>> ") + if line: + result = rag_chain.invoke(line) + print(result) + else: + break + diff --git a/hw1/loader.py b/hw1/loader.py index 1bcd1c4..2701ec0 100644 --- a/hw1/loader.py +++ b/hw1/loader.py @@ -15,11 +15,12 @@ Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env Code adapted from 1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb -2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG +2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py 3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/ Firecrawl docs reference https://github.com/mendableai/firecrawl-py +(Interestingly, this repo is only a week old at this time) This takes a while to crawl, so just run it once and watch out for firecrawl credit usage. """ @@ -27,43 +28,35 @@ This takes a while to crawl, so just run it once and watch out for firecrawl cre load_dotenv() crawl_params = { 'crawlerOptions': { - - #I want to exclude non-english paths, but this isn't working right yet. Needs work + #Exclude non-english paths, image resources, etc. Keep a comma after every entry: adjacent string literals are silently concatenated. 'excludes': [ - '*/cs', - '*/da', - '*/de', - '*/es', - '*/fi', - '*/fr', - '*/he', - '*/hr', - '*/hu', - '*/it', - '*/cs', - '*/da', - '*/de', - '*/es', - '*/fi', - '*/fr', - '*/he', - '*/hr', - '*/hu', - '*/it', - '*/ja', - '*/ko', - '*/nl', - '*/no', - '*/pl', - '*/pt', - '*/ru', - '*/sv', - '*/th', - '*/tr', - '*/zh-cn' + 'cs', + 'da', + 'de', + 'es', + 'fi', + 'fr', + 'he', + 'hr', + 'hu', + 'it', + 'ja', + 'ko', + 'nl', + 'no', + 'pl', + 'pt', + 'ru', + 'sv', + 'th', + 'tr', + 'zh-cn', + '.jpg', + '.png', + '.gif' ], 'includes': ['wiki/*'], - 'limit': 20, #higher limit means more credits and more wait time. 
+ 'limit': 75, #higher limit means more credits and more wait time. } } @@ -74,7 +67,6 @@ print("docs loaded") # Split text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) splits = text_splitter.split_documents(docs) - print("split complete") # Embed diff --git a/hw1/requirements.txt b/hw1/requirements.txt index 7647d69..467fd59 100644 --- a/hw1/requirements.txt +++ b/hw1/requirements.txt @@ -1,25 +1,7 @@ -langchain_google_genai langchain-community -grpcio langchain -huggingface_hub -bs4 -requests langchain_openai python-dotenv -langchain-experimental -langchainhub +firecrawl-py chromadb -pypdf -docx2txt -markdown -tiktoken -nltk -argparse -arxiv -pymupdf -wikipedia -asyncio -scikit-learn -unstructured - +langchainhub \ No newline at end of file diff --git a/hw1/search.py b/hw1/search.py index 2ef218f..a75914d 100644 --- a/hw1/search.py +++ b/hw1/search.py @@ -1,13 +1,11 @@ from langchain_community.vectorstores import Chroma -from langchain_google_genai import GoogleGenerativeAIEmbeddings -import readline from langchain_community.vectorstores import Chroma -from langchain_openai import ChatOpenAI, OpenAIEmbeddings +from langchain_openai import OpenAIEmbeddings from dotenv import load_dotenv """ A search utility for the loaded documents, for testing and debugging -Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding +Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py with small change for env loading and OpenAI embedding """