from typing import List
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from langchain_core.documents import Document
from dotenv import load_dotenv
"""
Loader attempting to load documents for the game Kerbal Space program 2 from
the games own fan-run wiki, using Firecrawl loader and GPT4 embedding.
Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that)
I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives
Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time)
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
"""
load_dotenv()
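
# Fail-fast sanity check (my addition, not from the referenced examples): crawling
# spends Firecrawl credits, so verify the expected keys exist before doing any work.
import os

for key in ("OPENAI_API_KEY", "FIRECRAWL_API_KEY"):
    if not os.getenv(key):
        raise RuntimeError(f"Missing {key}; add it to .env before running this loader.")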
page_options = {"onlyMainContent": True}
crawl_params = {
    "crawlerOptions": {
        # Exclude non-English wiki paths, image resources, etc.
        "excludes": [
            "cs",
            "da",
            "de",
            "es",
            "fi",
            "fr",
            "he",
            "hr",
            "hu",
            "it",
            "ja",
            "ko",
            "nl",
            "no",
            "pl",
            "pt",
            "ru",
            "sv",
            "th",
            "tr",
            "zh-cn",
            ".jpg",
            ".png",
            ".gif",
        ],
        "includes": ["wiki/*"],
        "limit": 75,  # A higher limit means more credits and more wait time.
    },
    "pageOptions": page_options,
}
loader = FireCrawlLoader(
    "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
    mode="crawl",
    params=crawl_params,
)
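
# Optional cheap smoke test (my addition, gated behind a hypothetical SMOKE_TEST env
# var so the normal run is unchanged): "scrape" mode, assumed to be supported by this
# langchain_community version, fetches only the start URL instead of crawling, which
# is handy for verifying the API keys before committing to a full crawl.
if os.getenv("SMOKE_TEST"):
    sample = FireCrawlLoader(
        "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
        mode="scrape",
    ).load()
    print(f"Scrape-mode smoke test returned {len(sample)} document(s).")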
docs: List[Document] = loader.load()
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits: List[Document] = text_splitter.split_documents(docs)
# This metadata incompatibility should eventually be fixed by the Firecrawl maintainer
# (ogLocaleAlternate is an empty list, which Chroma does not allow as a metadata value).
for doc in splits:
    doc.metadata.pop("ogLocaleAlternate", None)
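
# Broader guard (an assumption on my part, not in the referenced examples): Chroma
# only accepts scalar metadata values, so drop any other list- or dict-valued fields
# Firecrawl might add, in case ogLocaleAlternate is not the only offender.
for doc in splits:
    doc.metadata = {
        k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))
    }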
# Embed
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)
print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()["metadatas"]:
    document_data_sources.add(doc_metadata["sourceURL"])
for doc in document_data_sources:
    print(f" {doc}")