update app.py, requirements.txt and readme; features and cleanup
This commit is contained in:
parent
1002443082
commit
3fd7e3361f
@ -1,4 +1,40 @@
|
||||
## HW1 for gensec
|
||||
This is a RAG LLM application for asking questions about Kerbal Space Program (KSP). KSP and KSP 2 are very technical games relying on orbital physics, and players often want to quickly learn more about celestial body characteristics, common orbital maneuvers, delta-v requirements, and so on. The player-maintained [wiki](https://wiki.kerbalspaceprogram.com/) contains almost all of this information, so my application uses this as a primary source.
|
||||
|
||||
### Environment
|
||||
Install python3, then run the following:
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
touch .env
|
||||
```
|
||||
Afterwards, populate .env with your OPENAI_API_KEY and FIREWALL_API_KEY
|
||||
|
||||
### Loading
|
||||
This will take some time (2 minutes for ~75 pages), so be patient. Firecrawl does have support to query for the status of a long-running API call, so I will add status feedback later
|
||||
```
|
||||
python3 loader.py
|
||||
```
|
||||
|
||||
### Testing
|
||||
The loaded documents can be tested at this point with the search script:
|
||||
```
|
||||
python3 search.py
|
||||
```
|
||||
|
||||
### Running
|
||||
```
|
||||
python3 app.py
|
||||
```
|
||||
|
||||
### Example Questions
|
||||
```
|
||||
llm>> How much delta-V is required to exit Kerbin?
|
||||
llm>> How many moons does Jool have?
|
||||
llm>> How large is Gilly's sphere of influence?
|
||||
llm>> Describe Eve's physical characteristics
|
||||
llm>> Does the game support multiplayer?
|
||||
llm>> Which engines are good for deep space travel?
|
||||
|
||||
```
|
||||
|
||||
Enjoy!
|
49
hw1/app.py
49
hw1/app.py
@ -1 +1,48 @@
|
||||
#todo
|
||||
from langchain import hub
|
||||
from langchain_community.vectorstores import Chroma
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
||||
from dotenv import load_dotenv
|
||||
|
||||
"""
|
||||
User facing RAG application. Mostly adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/08_rag_query.py
|
||||
Small changes made regarding OpenAI Embedding, and loading env from dotenv.
|
||||
|
||||
I use the same rag-prompt since it's a good choice
|
||||
|
||||
"""
|
||||
|
||||
load_dotenv()
|
||||
|
||||
def format_docs(docs):
|
||||
return "\n\n".join(doc.page_content for doc in docs)
|
||||
|
||||
vectorstore = Chroma(
|
||||
embedding_function=OpenAIEmbeddings(),
|
||||
persist_directory="./rag_data/.chromadb"
|
||||
)
|
||||
prompt = hub.pull("rlm/rag-prompt")
|
||||
retriever = vectorstore.as_retriever()
|
||||
llm = ChatOpenAI(model="gpt-4")
|
||||
|
||||
rag_chain = (
|
||||
{"context": retriever | format_docs, "question": RunnablePassthrough()}
|
||||
| prompt
|
||||
| llm
|
||||
| StrOutputParser()
|
||||
)
|
||||
|
||||
print("Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions ")
|
||||
document_data_sources = set()
|
||||
for doc_metadata in retriever.vectorstore.get()['metadatas']:
|
||||
document_data_sources.add(doc_metadata['sourceURL'])
|
||||
|
||||
while True:
|
||||
line = input("llm>> ")
|
||||
if line:
|
||||
result = rag_chain.invoke(line)
|
||||
print(result)
|
||||
else:
|
||||
break
|
||||
|
||||
|
@ -15,11 +15,12 @@ Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
|
||||
|
||||
Code adapted from
|
||||
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
|
||||
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
|
||||
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
|
||||
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
|
||||
|
||||
Firecrawl docs reference
|
||||
https://github.com/mendableai/firecrawl-py
|
||||
(Interestingly, this repo is only a week old at this time)
|
||||
|
||||
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
|
||||
"""
|
||||
@ -27,43 +28,35 @@ This takes a while to crawl, so just run it once and watch out for firecrawl cre
|
||||
load_dotenv()
|
||||
crawl_params = {
|
||||
'crawlerOptions': {
|
||||
|
||||
#I want to exclude non-english paths, but this isn't working right yet. Needs work
|
||||
#Exclude non-english paths, image resources, etc.
|
||||
'excludes': [
|
||||
'*/cs',
|
||||
'*/da',
|
||||
'*/de',
|
||||
'*/es',
|
||||
'*/fi',
|
||||
'*/fr',
|
||||
'*/he',
|
||||
'*/hr',
|
||||
'*/hu',
|
||||
'*/it',
|
||||
'*/cs',
|
||||
'*/da',
|
||||
'*/de',
|
||||
'*/es',
|
||||
'*/fi',
|
||||
'*/fr',
|
||||
'*/he',
|
||||
'*/hr',
|
||||
'*/hu',
|
||||
'*/it',
|
||||
'*/ja',
|
||||
'*/ko',
|
||||
'*/nl',
|
||||
'*/no',
|
||||
'*/pl',
|
||||
'*/pt',
|
||||
'*/ru',
|
||||
'*/sv',
|
||||
'*/th',
|
||||
'*/tr',
|
||||
'*/zh-cn'
|
||||
'cs',
|
||||
'da',
|
||||
'de',
|
||||
'es',
|
||||
'fi',
|
||||
'fr',
|
||||
'he',
|
||||
'hr',
|
||||
'hu',
|
||||
'it',
|
||||
'ja',
|
||||
'ko',
|
||||
'nl',
|
||||
'no',
|
||||
'pl',
|
||||
'pt',
|
||||
'ru',
|
||||
'sv',
|
||||
'th',
|
||||
'tr',
|
||||
'zh-cn'
|
||||
'.jpg',
|
||||
'.png'
|
||||
'.gif'
|
||||
],
|
||||
'includes': ['wiki/*'],
|
||||
'limit': 20, #higher limit means more credits and more wait time.
|
||||
'limit': 75, #higher limit means more credits and more wait time.
|
||||
}
|
||||
}
|
||||
|
||||
@ -74,7 +67,6 @@ print("docs loaded")
|
||||
# Split
|
||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
||||
splits = text_splitter.split_documents(docs)
|
||||
|
||||
print("split complete")
|
||||
|
||||
# Embed
|
||||
|
@ -1,25 +1,7 @@
|
||||
langchain_google_genai
|
||||
langchain-community
|
||||
grpcio
|
||||
langchain
|
||||
huggingface_hub
|
||||
bs4
|
||||
requests
|
||||
langchain_openai
|
||||
python-dotenv
|
||||
langchain-experimental
|
||||
langchainhub
|
||||
firecrawl-py
|
||||
chromadb
|
||||
pypdf
|
||||
docx2txt
|
||||
markdown
|
||||
tiktoken
|
||||
nltk
|
||||
argparse
|
||||
arxiv
|
||||
pymupdf
|
||||
wikipedia
|
||||
asyncio
|
||||
scikit-learn
|
||||
unstructured
|
||||
|
||||
langchainhub
|
@ -1,13 +1,11 @@
|
||||
from langchain_community.vectorstores import Chroma
|
||||
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
||||
import readline
|
||||
from langchain_community.vectorstores import Chroma
|
||||
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from dotenv import load_dotenv
|
||||
|
||||
"""
|
||||
A search utility for the loaded documents, for testing and debugging
|
||||
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding
|
||||
Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py with small change for env loading and OpenAI embedding
|
||||
"""
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user