from typing import List

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import FireCrawlLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

"""
Loader that builds a RAG database for the game Kerbal Space Program 2 from the
game's own fan-run wiki, using the Firecrawl loader and OpenAI embeddings.

Firecrawl crawls every link reachable from the main page (within the configured
limits), which makes the document-loading step easy (why I chose it). I chose
OpenAI since I already pay for it, so hopefully it's a bit better than the free
alternatives.

Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env

Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/

Firecrawl docs reference: https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time.)

The crawl takes a while, so run this once and watch out for Firecrawl credit usage.
"""

load_dotenv()

page_options = {"onlyMainContent": True}
crawl_params = {
    "crawlerOptions": {
        # Exclude non-English paths, image resources, etc.
        "excludes": [
            "cs", "da", "de", "es", "fi", "fr", "he", "hr", "hu", "it",
            "ja", "ko", "nl", "no", "pl", "pt", "ru", "sv", "th", "tr",
            "zh-cn", ".jpg", ".png", ".gif",
        ],
        "includes": ["wiki/*"],
        "limit": 75,  # A higher limit means more credits and more wait time.
    },
    "pageOptions": page_options,
}

loader = FireCrawlLoader(
    "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
    mode="crawl",
    params=crawl_params,
)
docs: List[Document] = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits: List[Document] = text_splitter.split_documents(docs)

# This metadata incompatibility should be resolved by the Firecrawl maintainer
# (ogLocaleAlternate is an empty list, which Chroma does not allow in metadata).
for doc in splits:
    doc.metadata.pop("ogLocaleAlternate", None)

# Embed
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)

print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()["metadatas"]:
    document_data_sources.add(doc_metadata["sourceURL"])
for doc in document_data_sources:
    print(f"  {doc}")
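
# Illustrative retrieval check (a sketch, not part of the original script): run a
# quick query against the store built above to confirm embeddings are searchable.
# The query string and the top-3 limit are assumptions chosen for illustration;
# retriever.invoke() is LangChain's standard retriever call and returns a list of
# Document objects. In a later session the persisted store could be reopened
# without re-crawling via
# Chroma(persist_directory="./rag_data/.chromadb", embedding_function=OpenAIEmbeddings()).
sample_query = "How does delta-v work?"  # hypothetical example query
for i, hit in enumerate(retriever.invoke(sample_query)[:3], start=1):
    print(f"[{i}] {hit.metadata.get('sourceURL', 'unknown source')}")
    print(f"    {hit.page_content[:150]}")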