This repository has been archived on 2025-04-28. You can view files and clone it, but cannot push or open issues or pull requests.
gensec-westgate-djw2/hw1/loader.py

84 lines
2.8 KiB
Python

from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv
"""
Loader attempting to load documents for the game Kerbal Space program 2 from
the games own fan-run wiki, using Firecrawl loader and GPT4 embedding.
Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that)
I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives
Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time)
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
"""
load_dotenv()
crawl_params = {
'crawlerOptions': {
#Exclude non-english paths, image resources, etc.
'excludes': [
'cs',
'da',
'de',
'es',
'fi',
'fr',
'he',
'hr',
'hu',
'it',
'ja',
'ko',
'nl',
'no',
'pl',
'pt',
'ru',
'sv',
'th',
'tr',
'zh-cn'
'.jpg',
'.png'
'.gif'
],
'includes': ['wiki/*'],
'limit': 75, #higher limit means more credits and more wait time.
}
}
# Crawl the KSP wiki starting from the main page. This is a live network
# crawl that consumes Firecrawl credits — run it sparingly.
loader = FireCrawlLoader(
    "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
    mode="crawl",
    params=crawl_params,
)
docs = loader.load()
print("docs loaded")

# Chunk the crawled pages so each piece fits an embedding context window.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print("split complete")

# Embed each chunk with OpenAI and persist the vectors to a local Chroma DB.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)
print("RAG database initialized with the following sources.")

# Report the distinct source URLs that made it into the store.
retriever = vectorstore.as_retriever()
unique_sources = {meta['sourceURL'] for meta in retriever.vectorstore.get()['metadatas']}
for source in unique_sources:
    print(f" {source}")