update app.py, requirements.txt and readme; features and cleanup

David Westgate 2024-04-18 23:52:54 -07:00
parent 1002443082
commit 3fd7e3361f
5 changed files with 117 additions and 62 deletions

View File

@@ -1,4 +1,40 @@
## HW1 for gensec
This is a RAG LLM application for asking questions about Kerbal Space Program (KSP). KSP and KSP 2 are very technical games built around orbital physics, and players often want to quickly look up celestial body characteristics, common orbital maneuvers, delta-v requirements, and so on. The player-maintained [wiki](https://wiki.kerbalspaceprogram.com/) contains almost all of this information, so my application uses it as its primary source.
### Environment
Install python3, then run the following:
```
pip install -r requirements.txt
touch .env
```
Then populate .env with your OPENAI_API_KEY and FIREWALL_API_KEY.
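For reference, the resulting .env should look something like this (placeholder values):
```
OPENAI_API_KEY=your-openai-key-here
FIREWALL_API_KEY=your-firecrawl-key-here
```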
### Loading
This will take some time (about 2 minutes at the configured limit of ~75 pages), so be patient. Firecrawl does support querying the status of a long-running crawl job, so I will add status feedback later; a rough sketch of what that could look like follows the command below.
```
python3 loader.py
```
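A rough sketch of that status feedback, assuming firecrawl-py's `FirecrawlApp` exposes `crawl_url(..., wait_until_done=False)` and `check_crawl_status()`; the method names and returned fields here are assumptions to verify against the firecrawl-py docs, and this is not yet part of loader.py:
```
# Sketch only: poll Firecrawl for crawl progress instead of blocking.
# Assumes crawl_url(..., wait_until_done=False) returns a job id and
# check_crawl_status() reports progress -- verify against firecrawl-py docs.
import os
import time
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()
app = FirecrawlApp(api_key=os.getenv("FIREWALL_API_KEY"))

# Kick off the crawl without waiting, then poll its status.
job = app.crawl_url(
    "https://wiki.kerbalspaceprogram.com",
    params={"crawlerOptions": {"includes": ["wiki/*"], "limit": 75}},
    wait_until_done=False,
)
while True:
    status = app.check_crawl_status(job["jobId"])
    print(f"crawl status: {status.get('status')}")
    if status.get("status") in ("completed", "failed"):
        break
    time.sleep(15)
```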
### Testing
The loaded documents can be tested at this point with the search script
```
python3 search.py
```
### Running
```
python3 app.py
```
### Example Questions
```
llm>> How much delta-V is required to exit Kerbin?
llm>> How many moons does Jool have?
llm>> How large is Gilly's sphere of influence?
llm>> Describe Eve's physical characteristics
llm>> Does the game support multiplayer?
llm>> Which engines are good for deep space travel?
```
Enjoy!

View File

@@ -1 +1,48 @@
#todo
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
"""
User-facing RAG application. Mostly adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/08_rag_query.py
Small changes made to use OpenAI embeddings and to load environment variables with dotenv.
I use the same rag-prompt since it's a good choice
"""
load_dotenv()
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
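# Reuse the Chroma store persisted under ./rag_data/.chromadb (built by loader.py) with the same OpenAI embeddings.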
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb"
)
prompt = hub.pull("rlm/rag-prompt")
retriever = vectorstore.as_retriever()
llm = ChatOpenAI(model="gpt-4")
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
print("Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions ")
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
document_data_sources.add(doc_metadata['sourceURL'])
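# Simple REPL: answer questions until the user enters an empty line.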
while True:
    line = input("llm>> ")
    if line:
        result = rag_chain.invoke(line)
        print(result)
    else:
        break

View File

@@ -15,11 +15,12 @@ Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time)
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
"""
@@ -27,43 +28,35 @@ This takes a while to crawl, so just run it once and watch out for firecrawl cre
load_dotenv()
crawl_params = {
'crawlerOptions': {
#I want to exclude non-english paths, but this isn't working right yet. Needs work
#Exclude non-english paths, image resources, etc.
'excludes': [
'*/cs',
'*/da',
'*/de',
'*/es',
'*/fi',
'*/fr',
'*/he',
'*/hr',
'*/hu',
'*/it',
'*/ja',
'*/ko',
'*/nl',
'*/no',
'*/pl',
'*/pt',
'*/ru',
'*/sv',
'*/th',
'*/tr',
'*/zh-cn'
'cs',
'da',
'de',
'es',
'fi',
'fr',
'he',
'hr',
'hu',
'it',
'ja',
'ko',
'nl',
'no',
'pl',
'pt',
'ru',
'sv',
'th',
'tr',
'zh-cn',
'.jpg',
'.png',
'.gif'
],
'includes': ['wiki/*'],
'limit': 20, #higher limit means more credits and more wait time.
'limit': 75, #higher limit means more credits and more wait time.
}
}
@@ -74,7 +67,6 @@ print("docs loaded")
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")
# Embed
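The loader invocation itself falls outside the hunks shown above. As a rough sketch of how `crawl_params` could feed a Firecrawl-backed LangChain loader (the use of `FireCrawlLoader` and the exact arguments here are assumptions, not code from this commit):
```
# Hypothetical sketch, not part of this commit: pass crawl_params to a
# Firecrawl-backed LangChain document loader and load the wiki pages.
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import FireCrawlLoader

load_dotenv()
loader = FireCrawlLoader(
    url="https://wiki.kerbalspaceprogram.com",
    api_key=os.getenv("FIREWALL_API_KEY"),  # key name taken from the README above
    mode="crawl",
    params=crawl_params,  # the options dict defined above in loader.py
)
docs = loader.load()
print("docs loaded")
```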

View File

@@ -1,25 +1,7 @@
langchain_google_genai
langchain-community
grpcio
langchain
huggingface_hub
bs4
requests
langchain_openai
python-dotenv
langchain-experimental
langchainhub
firecrawl-py
chromadb
pypdf
docx2txt
markdown
tiktoken
nltk
argparse
arxiv
pymupdf
wikipedia
asyncio
scikit-learn
unstructured
langchainhub

View File

@@ -1,13 +1,11 @@
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import readline
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
"""
A search utility for the loaded documents, for testing and debugging
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding
Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py with small change for env loading and OpenAI embedding
"""