update app.py, requirements.txt and readme; features and cleanup

David Westgate 2024-04-18 23:52:54 -07:00
parent 1002443082
commit 3fd7e3361f
5 changed files with 117 additions and 62 deletions

README.md

@@ -1,4 +1,40 @@
## HW1 for gensec
This is a RAG LLM application for asking questions about Kerbal Space Program (KSP). KSP and KSP 2 are very technical games relying on orbital physics, and players often want to quickly learn more about celestial body characteristics, common orbital maneuvers, delta-v requirements, and so on. The player-maintained [wiki](https://wiki.kerbalspaceprogram.com/) contains almost all of this information, so my application uses it as the primary source.
### Environment
Install python3, then run the following:
```
pip install -r requirements.txt
touch .env
```
Then populate .env with your OPENAI_API_KEY and FIRECRAWL_API_KEY.
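A minimal .env would look like this (placeholder values; the variable names assume the standard Firecrawl key name):
```
OPENAI_API_KEY=sk-your-key-here
FIRECRAWL_API_KEY=fc-your-key-here
```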
### Loading
This will take some time (about 2 minutes for ~75 pages), so be patient. Firecrawl does support querying the status of a long-running crawl job, so I will add status feedback later (see the sketch below).
```
python3 loader.py
```
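The planned status feedback would roughly look like the following sketch. This is an illustration, not part of the commit; it assumes firecrawl-py's crawl_url with wait_until_done=False returns a job id accepted by check_crawl_status, and it reuses the crawl_params defined in loader.py.
```
import os
import time
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

# Start the crawl without blocking, then poll the job until it finishes
job = app.crawl_url("https://wiki.kerbalspaceprogram.com/",
                    params=crawl_params, wait_until_done=False)
while True:
    status = app.check_crawl_status(job["jobId"])
    print(f"crawl status: {status['status']}")
    if status["status"] == "completed":
        break
    time.sleep(10)
```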
### Testing
The loaded documents can be tested at this point with the search script
```
python3 search.py
```
### Running
```
python3 app.py
```
### Example Questions
```
llm>> How much delta-V is required to exit Kerbin?
llm>> How many moons does Jool have?
llm>> How large is Gilly's sphere of influence?
llm>> Describe Eve's physical characteristics
llm>> Does the game support multiplayer?
llm>> Which engines are good for deep space travel?
```
Enjoy!

app.py

@@ -1 +1,48 @@
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
"""
User-facing RAG application. Mostly adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/08_rag_query.py
Small changes made to use OpenAI embeddings and to load the environment from dotenv.
I use the same rag-prompt since it's a good choice.
"""
load_dotenv()

def format_docs(docs):
    # Join retrieved chunks into a single context string for the prompt
    return "\n\n".join(doc.page_content for doc in docs)

vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb"
)
prompt = hub.pull("rlm/rag-prompt")
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(model="gpt-4")

# Retrieve context for the question, fill the prompt, call the model, parse to text
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions about KSP.")
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    document_data_sources.add(doc_metadata['sourceURL'])
print(f"Loaded documents from {len(document_data_sources)} source pages")
while True:
    line = input("llm>> ")
    if line:
        result = rag_chain.invoke(line)
        print(result)
    else:
        break

loader.py

@@ -15,11 +15,12 @@ Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
This takes a while to crawl, so just run it once and watch out for Firecrawl credit usage.
"""
@@ -27,43 +28,35 @@ This takes a while to crawl, so just run it once and watch out for Firecrawl credit usage.
load_dotenv()
crawl_params = {
    'crawlerOptions': {
        # Exclude non-English paths, image resources, etc.
        'excludes': [
            'cs',
            'da',
            'de',
            'es',
            'fi',
            'fr',
            'he',
            'hr',
            'hu',
            'it',
            'ja',
            'ko',
            'nl',
            'no',
            'pl',
            'pt',
            'ru',
            'sv',
            'th',
            'tr',
            'zh-cn',
            '.jpg',
            '.png',
            '.gif'
        ],
        'includes': ['wiki/*'],
        'limit': 75,  # higher limit means more credits and more wait time
    }
}
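The loader call itself falls outside this hunk. Assuming it uses langchain_community's FireCrawlLoader, it plausibly resembles the sketch below; the URL and api_key handling are my assumptions, not part of the diff.
```
import os
from langchain_community.document_loaders import FireCrawlLoader

# Crawl the KSP wiki via Firecrawl and return LangChain documents
loader = FireCrawlLoader(
    api_key=os.getenv("FIRECRAWL_API_KEY"),  # assumes .env already loaded above
    url="https://wiki.kerbalspaceprogram.com/",
    mode="crawl",
    params=crawl_params,
)
docs = loader.load()
```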
@@ -74,7 +67,6 @@ print("docs loaded")
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")
# Embed
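The embedding step is also outside the hunk; presumably it persists the splits into the same Chroma directory that app.py and search.py read from. A sketch of what that likely looks like:
```
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Embed the chunks and persist them for app.py and search.py to query
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)
```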

requirements.txt

@@ -1,25 +1,7 @@
langchain-community
langchain
langchain_openai
python-dotenv
firecrawl-py
chromadb
langchainhub

search.py

@@ -1,13 +1,11 @@
from langchain_community.vectorstores import Chroma
import readline
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
""" """
A search utility for the loaded documents, for testing and debugging A search utility for the loaded documents, for testing and debugging
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py with small change for env loading and OpenAI embedding
""" """