from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv
"""
|
|
Loader attempting to load documents for the game Kerbal Space program 2 from
|
|
the games own fan-run wiki, using Firecrawl loader and GPT4 embedding.
|
|
|
|
Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that)
|
|
I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives
|
|
|
|
Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
|
|
|
|
Code adapted from
|
|
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
|
|
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
|
|
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
|
|
|
|
Firecrawl docs reference
|
|
https://github.com/mendableai/firecrawl-py
|
|
|
|
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
|
|
"""

load_dotenv()
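
# Not from the referenced labs: a fail-fast check that the keys the loaders
# expect are actually present in .env. Assumes the standard OPENAI_API_KEY /
# FIRECRAWL_API_KEY variable names mentioned in the docstring above.
import os

for key in ("OPENAI_API_KEY", "FIRECRAWL_API_KEY"):
    if not os.getenv(key):
        raise SystemExit(f"Missing {key}; add it to .env before running.")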

crawl_params = {
    'crawlerOptions': {
        # I want to exclude non-English paths, but this isn't working right
        # yet (needs work); as a fallback, the loaded docs are post-filtered
        # below.
        'excludes': [
            '*/cs',
            '*/da',
            '*/de',
            '*/es',
            '*/fi',
            '*/fr',
            '*/he',
            '*/hr',
            '*/hu',
            '*/it',
            '*/ja',
            '*/ko',
            '*/nl',
            '*/no',
            '*/pl',
            '*/pt',
            '*/ru',
            '*/sv',
            '*/th',
            '*/tr',
            '*/zh-cn',
        ],
        'includes': ['wiki/*'],
        'limit': 20,  # a higher limit means more credits and more wait time
    }
}

loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
                         mode="crawl", params=crawl_params)
docs = loader.load()
print("docs loaded")

# Split the pages into 1000-character chunks with 200 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")

# Embed the chunks and persist them to a local Chroma database
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings(),
                                    persist_directory="./rag_data/.chromadb")
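
# A later run could reopen the persisted store instead of re-crawling and
# re-embedding. A sketch, assuming the same directory and embedding model:
#
#     vectorstore = Chroma(persist_directory="./rag_data/.chromadb",
#                          embedding_function=OpenAIEmbeddings())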

print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    document_data_sources.add(doc_metadata['sourceURL'])
for doc in document_data_sources:
    print(f"  {doc}")