97 lines
3.0 KiB
Python
97 lines
3.0 KiB
Python
from typing import List
|
|
from langchain_community.vectorstores import Chroma
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_openai import OpenAIEmbeddings
|
|
from langchain_community.document_loaders import FireCrawlLoader
|
|
from langchain_core.documents import Document
|
|
from dotenv import load_dotenv
|
|
|
|
"""
|
|
Loader that attempts to load documents for the game Kerbal Space Program 2 from
|
|
the game's own fan-run wiki, using the Firecrawl loader and OpenAI embeddings.
|
|
|
|
Firecrawl should crawl all links reachable from the main page (within the configured limits), which makes document loading easy (that is why I chose it).
|
|
I chose OpenAI/GPT4 since I already pay for it, so hopefully it's a bit better than the free alternatives.
|
|
|
|
Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
|
|
|
|
Code adapted from
|
|
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
|
|
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
|
|
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
|
|
|
|
Firecrawl docs reference
|
|
https://github.com/mendableai/firecrawl-py
|
|
(Interestingly, this repo is only a week old at this time)
|
|
|
|
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
|
|
"""
|
|
|
|
# Read the local .env file so the API keys it holds are available in the
# environment before any of the clients below are constructed.
load_dotenv()
|
|
# Per-page scraping options. Defined once and referenced from crawl_params
# below so the two can never drift apart.
page_options = {"onlyMainContent": True}

# Parameters handed to the Firecrawl loader when crawling the wiki.
crawl_params = {
    "crawlerOptions": {
        # Exclude non-English paths, image resources, etc.
        # NOTE(review): confirm how Firecrawl matches these patterns against
        # URLs (substring vs. glob) so short codes do not over- or under-match.
        "excludes": [
            "cs",
            "da",
            "de",
            "es",
            "fi",
            "fr",
            "he",
            "hr",
            "hu",
            "it",
            "ja",
            "ko",
            "nl",
            "no",
            "pl",
            "pt",
            "ru",
            "sv",
            "th",
            "tr",
            "zh-cn",
            ".jpg",
            ".png",
            ".gif",
        ],
        "includes": ["wiki/*"],
        "limit": 75,  # higher limit means more credits and more wait time.
    },
    "pageOptions": page_options,
}
|
|
|
|
# Entry point of the crawl; Firecrawl is run in "crawl" mode from this page
# using the options collected in crawl_params.
start_url = "https://wiki.kerbalspaceprogram.com/wiki/Main_Page"
loader = FireCrawlLoader(start_url, mode="crawl", params=crawl_params)

# Run the crawl and collect every returned page as a Document.
docs: List[Document] = loader.load()
|
|
|
|
# Split the crawled pages into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits: List[Document] = text_splitter.split_documents(docs)

# Workaround for a metadata incompatibility that should be resolved by the
# Firecrawl maintainer: ogLocaleAlternate is an empty list, which Chroma does
# not allow as a metadata value, so drop it from every chunk.
for chunk in splits:
    chunk.metadata.pop("ogLocaleAlternate", None)
|
|
|
|
# Embed every chunk with OpenAI embeddings and store the vectors in a Chroma
# collection under ./rag_data/.chromadb.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    splits,
    embedding=embedding_model,
    persist_directory="./rag_data/.chromadb",
)
|
|
|
|
|
|
# Report which pages ended up in the vector store.
print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()

# Collapse the per-chunk metadata to the set of unique source URLs. Entries
# whose metadata is missing or has no "sourceURL" are skipped rather than
# aborting the report with a KeyError/TypeError.
document_data_sources = {
    doc_metadata["sourceURL"]
    for doc_metadata in retriever.vectorstore.get()["metadatas"]
    if doc_metadata and "sourceURL" in doc_metadata
}

# Sets have no stable order; sort so repeated runs print the same listing.
for source_url in sorted(document_data_sources):
    print(f" {source_url}")
|