from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv

"""
Loader attempting to load documents for the game Kerbal Space Program 2 from
the game's own fan-run wiki, using the Firecrawl loader and OpenAI/GPT-4
embeddings.

Firecrawl should crawl to all links from the main page (with configured
limits), making easy work of the document loader (why I chose that).
I chose OpenAI/GPT-4 since I already pay for that, so hopefully it's a bit
better than free alternatives.

Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env

Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/

Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time)

This takes a while to crawl, so just run it once and watch out for Firecrawl
credit usage.
"""

load_dotenv()

crawl_params = {
    'crawlerOptions': {
        # Exclude non-English language paths, image resources, etc.
        # BUGFIX: the original list was missing commas after 'zh-cn' and
        # '.png'; Python's implicit string-literal concatenation silently
        # merged adjacent entries into 'zh-cn.jpg' and '.png.gif', so those
        # four patterns were never actually excluded from the crawl.
        'excludes': [
            'cs',
            'da',
            'de',
            'es',
            'fi',
            'fr',
            'he',
            'hr',
            'hu',
            'it',
            'ja',
            'ko',
            'nl',
            'no',
            'pl',
            'pt',
            'ru',
            'sv',
            'th',
            'tr',
            'zh-cn',
            '.jpg',
            '.png',
            '.gif',
        ],
        'includes': ['wiki/*'],
        'limit': 75,  # higher limit means more credits and more wait time.
    }
}

# Crawl the whole wiki starting from the main page. Slow, and consumes
# Firecrawl credits — run sparingly.
loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page", mode="crawl", params=crawl_params)
docs = loader.load()
print("docs loaded")

# Split the crawled pages into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")

# Embed each chunk with OpenAI and persist the vectors to a local Chroma DB.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings(),
                                    persist_directory="./rag_data/.chromadb")

# Report every distinct source URL that made it into the vector store.
print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    document_data_sources.add(doc_metadata['sourceURL'])
for doc in document_data_sources:
    print(f" {doc}")