from typing import List
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from langchain_core.documents import Document
from dotenv import load_dotenv
"""
Loader attempting to load documents for the game Kerbal Space program 2 from
the games own fan-run wiki, using Firecrawl loader and GPT4 embedding.
Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that)
I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives
Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time)
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
"""
load_dotenv()
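
# Fail-fast sanity check (my addition, not from the referenced examples): crawling
# spends Firecrawl credits, so verify the expected keys exist before doing any work.
import os

for key in ("OPENAI_API_KEY", "FIRECRAWL_API_KEY"):
    if not os.getenv(key):
        raise RuntimeError(f"Missing {key}; add it to .env before running this loader.")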
page_options = {"onlyMainContent": True}
crawl_params = {
    "crawlerOptions": {
        # Exclude non-English wiki paths, image resources, etc.
        "excludes": [
            "cs",
            "da",
            "de",
            "es",
            "fi",
            "fr",
            "he",
            "hr",
            "hu",
            "it",
            "ja",
            "ko",
            "nl",
            "no",
            "pl",
            "pt",
            "ru",
            "sv",
            "th",
            "tr",
            "zh-cn",
            ".jpg",
            ".png",
            ".gif",
        ],
        "includes": ["wiki/*"],
        "limit": 75,  # A higher limit means more credits and more wait time.
    },
    "pageOptions": page_options,
}
loader = FireCrawlLoader(
    "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
    mode="crawl",
    params=crawl_params,
)
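
# Optional cheap smoke test (my addition, gated behind a hypothetical SMOKE_TEST env
# var so the normal run is unchanged): "scrape" mode, assumed to be supported by this
# langchain_community version, fetches only the start URL instead of crawling, which
# is handy for verifying the API keys before committing to a full crawl.
if os.getenv("SMOKE_TEST"):
    sample = FireCrawlLoader(
        "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
        mode="scrape",
    ).load()
    print(f"Scrape-mode smoke test returned {len(sample)} document(s).")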
docs: List[Document] = loader.load()
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits: List[Document] = text_splitter.split_documents(docs)
# This metadata incompatibility should eventually be fixed by the Firecrawl maintainer
# (ogLocaleAlternate is an empty list, which Chroma does not allow as a metadata value).
for doc in splits:
    doc.metadata.pop("ogLocaleAlternate", None)
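
# Broader guard (an assumption on my part, not in the referenced examples): Chroma
# only accepts scalar metadata values, so drop any other list- or dict-valued fields
# Firecrawl might add, in case ogLocaleAlternate is not the only offender.
for doc in splits:
    doc.metadata = {
        k: v for k, v in doc.metadata.items() if isinstance(v, (str, int, float, bool))
    }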
# Embed
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)
print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()["metadatas"]:
    document_data_sources.add(doc_metadata["sourceURL"])
for doc in document_data_sources:
    print(f" {doc}")