"""
Loader that builds a RAG database for the game Kerbal Space Program 2 from the
game's fan-run wiki, using the FireCrawl loader and OpenAI embeddings.

Firecrawl crawls all links reachable from the main page (within the configured
limits), which makes the document-loading step easy (and is why I chose it).
I chose OpenAI since I already pay for it, so hopefully the embeddings are a
bit better than the free alternatives.

Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env.

Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/

Firecrawl docs reference: https://github.com/mendableai/firecrawl-py

The crawl takes a while, so run this once and watch out for Firecrawl credit usage.
"""

from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv

load_dotenv()

crawl_params = {
    'crawlerOptions': {
        # I want to exclude non-English paths, but this isn't working right yet. Needs work.
        'excludes': [
            '*/cs', '*/da', '*/de', '*/es', '*/fi', '*/fr', '*/he', '*/hr',
            '*/hu', '*/it', '*/ja', '*/ko', '*/nl', '*/no', '*/pl', '*/pt',
            '*/ru', '*/sv', '*/th', '*/tr', '*/zh-cn'
        ],
        'includes': ['wiki/*'],
        'limit': 20,  # A higher limit means more credits and a longer wait.
    }
}

# Crawl the wiki starting from the main page.
loader = FireCrawlLoader(
    "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
    mode="crawl",
    params=crawl_params,
)
docs = loader.load()
print("docs loaded")

# Split the pages into overlapping chunks so each embedding covers a focused span.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")

# Embed the chunks and persist them to a local Chroma database.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)
print("RAG database initialized with the following sources.")

# List the distinct source URLs that made it into the database.
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    document_data_sources.add(doc_metadata['sourceURL'])
for doc in document_data_sources:
    print(f"  {doc}")
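
# A minimal sketch of querying the persisted database from a separate script,
# assuming the crawl above has already been run once. Reopening Chroma from
# persist_directory avoids re-crawling (and re-spending Firecrawl credits);
# the question string below is just an illustrative example, not part of the
# original script.
#
#   from langchain_community.vectorstores import Chroma
#   from langchain_openai import OpenAIEmbeddings
#
#   vectorstore = Chroma(
#       persist_directory="./rag_data/.chromadb",
#       embedding_function=OpenAIEmbeddings(),
#   )
#   retriever = vectorstore.as_retriever()
#   for doc in retriever.invoke("How do I reach a stable orbit around Kerbin?"):
#       print(doc.metadata['sourceURL'], doc.page_content[:100])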