update app.py, requirements.txt and readme; features and cleanup
This commit is contained in:
parent
1002443082
commit
3fd7e3361f
@ -1,4 +1,40 @@
|
|||||||
## HW1 for gensec
|
## HW1 for gensec
|
||||||
|
This is a RAG LLM application for asking questions about Kerbal Space Program (KSP). KSP and KSP 2 are very technical games relying on orbital physics, and players often want to quickly learn more about celestial body characteristics, common orbital maneuvers, delta-v requirements, and so on. The player-maintained [wiki](https://wiki.kerbalspaceprogram.com/) contains almost all of this information, so my application uses this as a primary source
|
||||||
|
|
||||||
### Environment
|
### Environment
|
||||||
pip install -r requirements.txt
|
Install python3, then run the following:
|
||||||
|
```
|
||||||
|
pip install -r requirements.txt
|
||||||
|
touch .env
|
||||||
|
```
|
||||||
|
After, populate .env with your OPENAI_API_KEY and FIREWALL_API_KEY
|
||||||
|
|
||||||
|
### Loading
|
||||||
|
This will take some time (about 2 minutes for ~75 documents), so be patient. Firecrawl does have support to query for the status of a long-running API call, so I will add status feedback later
|
||||||
|
```
|
||||||
|
python3 loader.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
The loaded documents can be tested at this point with the search script
|
||||||
|
```
|
||||||
|
python3 search.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running
|
||||||
|
```
|
||||||
|
python3 app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example Questions
|
||||||
|
```
|
||||||
|
llm>> How much delta-V is required to exit Kerbin?
|
||||||
|
llm>> How many moons does Jool have?
|
||||||
|
llm>> How large is Gilly's sphere of influence?
|
||||||
|
llm>> Describe Eve's physical characteristics
|
||||||
|
llm>> Does the game support multiplayer?
|
||||||
|
llm>> Which engines are good for deep space travel?
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Enjoy!
|
49
hw1/app.py
49
hw1/app.py
@ -1 +1,48 @@
|
|||||||
#todo
|
from langchain import hub
|
||||||
|
from langchain_community.vectorstores import Chroma
|
||||||
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
|
from langchain_core.runnables import RunnablePassthrough
|
||||||
|
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
"""
|
||||||
|
User facing RAG application. Mostly adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/08_rag_query.py
|
||||||
|
Small changes made regarding OpenAI Embedding, and loading env from dotenv.
|
||||||
|
|
||||||
|
I use the same rag-prompt since it's a good choice
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
def format_docs(docs):
    """Join the page contents of retrieved documents with blank-line separators.

    Produces the single context string that is fed into the RAG prompt.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
|
||||||
|
|
||||||
|
# Vector store backed by the Chroma DB that loader.py persisted to disk.
# Embeddings must match the ones used at load time (OpenAI).
vectorstore = Chroma(
    persist_directory="./rag_data/.chromadb",
    embedding_function=OpenAIEmbeddings(),
)
retriever = vectorstore.as_retriever()

# Community rag-prompt from the LangChain hub; model choice is fixed here.
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model="gpt-4")

# LCEL pipeline: retrieve context, format it, fill the prompt, query the
# LLM, and strip the response down to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
|
||||||
|
|
||||||
|
print("Welcome to the Kerbal Space Program RAG application. I will try to assist you with any questions ")
# Collect the distinct source URLs of everything in the store.
# NOTE(review): this set is built but not used afterwards — presumably
# intended for a future "list sources" feature; confirm before removing.
document_data_sources = {
    meta['sourceURL']
    for meta in retriever.vectorstore.get()['metadatas']
}
|
||||||
|
|
||||||
|
# Simple REPL: an empty line (or plain Enter) exits the program.
while True:
    query = input("llm>> ")
    if not query:
        break
    answer = rag_chain.invoke(query)
    print(answer)
|
||||||
|
|
||||||
|
@ -15,11 +15,12 @@ Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
|
|||||||
|
|
||||||
Code adapted from
|
Code adapted from
|
||||||
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
|
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
|
||||||
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
|
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
|
||||||
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
|
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
|
||||||
|
|
||||||
Firecrawl docs reference
|
Firecrawl docs reference
|
||||||
https://github.com/mendableai/firecrawl-py
|
https://github.com/mendableai/firecrawl-py
|
||||||
|
(Interestingly, this repo is only a week old at this time)
|
||||||
|
|
||||||
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
|
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
|
||||||
"""
|
"""
|
||||||
@ -27,43 +28,35 @@ This takes a while to crawl, so just run it once and watch out for firecrawl cre
|
|||||||
load_dotenv()
|
load_dotenv()
|
||||||
crawl_params = {
|
crawl_params = {
|
||||||
'crawlerOptions': {
|
'crawlerOptions': {
|
||||||
|
#Exclude non-english paths, image resources, etc.
|
||||||
#I want to exclude non-english paths, but this isn't working right yet. Needs work
|
|
||||||
'excludes': [
|
'excludes': [
|
||||||
'*/cs',
|
'cs',
|
||||||
'*/da',
|
'da',
|
||||||
'*/de',
|
'de',
|
||||||
'*/es',
|
'es',
|
||||||
'*/fi',
|
'fi',
|
||||||
'*/fr',
|
'fr',
|
||||||
'*/he',
|
'he',
|
||||||
'*/hr',
|
'hr',
|
||||||
'*/hu',
|
'hu',
|
||||||
'*/it',
|
'it',
|
||||||
'*/cs',
|
'ja',
|
||||||
'*/da',
|
'ko',
|
||||||
'*/de',
|
'nl',
|
||||||
'*/es',
|
'no',
|
||||||
'*/fi',
|
'pl',
|
||||||
'*/fr',
|
'pt',
|
||||||
'*/he',
|
'ru',
|
||||||
'*/hr',
|
'sv',
|
||||||
'*/hu',
|
'th',
|
||||||
'*/it',
|
'tr',
|
||||||
'*/ja',
|
'zh-cn'
|
||||||
'*/ko',
|
'.jpg',
|
||||||
'*/nl',
|
'.png'
|
||||||
'*/no',
|
'.gif'
|
||||||
'*/pl',
|
|
||||||
'*/pt',
|
|
||||||
'*/ru',
|
|
||||||
'*/sv',
|
|
||||||
'*/th',
|
|
||||||
'*/tr',
|
|
||||||
'*/zh-cn'
|
|
||||||
],
|
],
|
||||||
'includes': ['wiki/*'],
|
'includes': ['wiki/*'],
|
||||||
'limit': 20, #higher limit means more credits and more wait time.
|
'limit': 75, #higher limit means more credits and more wait time.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -74,7 +67,6 @@ print("docs loaded")
|
|||||||
# Split
|
# Split
|
||||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
||||||
splits = text_splitter.split_documents(docs)
|
splits = text_splitter.split_documents(docs)
|
||||||
|
|
||||||
print("split complete")
|
print("split complete")
|
||||||
|
|
||||||
# Embed
|
# Embed
|
||||||
|
@ -1,25 +1,7 @@
|
|||||||
langchain_google_genai
|
|
||||||
langchain-community
|
langchain-community
|
||||||
grpcio
|
|
||||||
langchain
|
langchain
|
||||||
huggingface_hub
|
|
||||||
bs4
|
|
||||||
requests
|
|
||||||
langchain_openai
|
langchain_openai
|
||||||
python-dotenv
|
python-dotenv
|
||||||
langchain-experimental
|
firecrawl-py
|
||||||
langchainhub
|
|
||||||
chromadb
|
chromadb
|
||||||
pypdf
|
langchainhub
|
||||||
docx2txt
|
|
||||||
markdown
|
|
||||||
tiktoken
|
|
||||||
nltk
|
|
||||||
argparse
|
|
||||||
arxiv
|
|
||||||
pymupdf
|
|
||||||
wikipedia
|
|
||||||
asyncio
|
|
||||||
scikit-learn
|
|
||||||
unstructured
|
|
||||||
|
|
@ -1,13 +1,11 @@
|
|||||||
from langchain_community.vectorstores import Chroma
|
from langchain_community.vectorstores import Chroma
|
||||||
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
|
||||||
import readline
|
|
||||||
from langchain_community.vectorstores import Chroma
|
from langchain_community.vectorstores import Chroma
|
||||||
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
from langchain_openai import OpenAIEmbeddings
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
"""
|
"""
|
||||||
A search utility for the loaded documents, for testing and debugging
|
A search utility for the loaded documents, for testing and debugging
|
||||||
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding
|
Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py with small change for env loading and OpenAI embedding
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user