"""
Loader attempting to load documents for the game Kerbal Space Program 2 from the game's own fan-run wiki, using the Firecrawl loader and OpenAI embeddings.

Firecrawl should crawl all links from the main page (within the configured limits), making easy work of the document loading (why I chose it).
I chose OpenAI embeddings since I already pay for GPT-4, so hopefully they're a bit better than free alternatives.

Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env

Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/

Firecrawl docs reference: https://github.com/mendableai/firecrawl-py (interestingly, this repo is only a week old at this time)

This takes a while to crawl, so just run it once and watch out for Firecrawl credit usage.
"""

from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv

load_dotenv()

crawl_params = {
    'crawlerOptions': {
        # Exclude non-English language paths and image resources.
        'excludes': [
            'cs', 'da', 'de', 'es', 'fi', 'fr', 'he', 'hr', 'hu', 'it',
            'ja', 'ko', 'nl', 'no', 'pl', 'pt', 'ru', 'sv', 'th', 'tr',
            'zh-cn', '.jpg', '.png', '.gif'
        ],
        'includes': ['wiki/*'],
        'limit': 75,  # A higher limit means more credits used and a longer wait.
    }
}

# Crawl outward from the wiki's main page; each crawled page becomes one Document.
loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
                         mode="crawl", params=crawl_params)
docs = loader.load()
print("docs loaded")

# Split the pages into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")

# Embed the chunks and persist the vector store to disk.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings(),
                                    persist_directory="./rag_data/.chromadb")
print("RAG database initialized with the following sources.")

# List the unique source URLs recorded in the store's chunk metadata.
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    document_data_sources.add(doc_metadata['sourceURL'])
for doc in document_data_sources:
    print(f"  {doc}")
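
# A quick retrieval sanity check against the freshly built store. This is a
# minimal sketch and not part of the adapted sources: the query string is an
# illustrative example, and similarity_search() is the standard LangChain
# Chroma call for fetching the k nearest chunks to a query.
sample_query = "How do I perform a gravity turn?"
for doc in vectorstore.similarity_search(sample_query, k=3):
    print(f"\n[{doc.metadata['sourceURL']}]\n{doc.page_content[:200]}")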