This repository has been archived on 2025-04-28. You can view files and clone it, but cannot push or open issues or pull requests.
gensec-westgate-djw2/hw1/loader.py

84 lines
2.8 KiB
Python

from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv
"""
Loader attempting to load documents for the game Kerbal Space program 2 from
the games own fan-run wiki, using Firecrawl loader and GPT4 embedding.
Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that)
I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives
Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://github.com/wu4f/cs410g-src/blob/main/03_RAG/06_rag_loaddb.py
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
(Interestingly, this repo is only a week old at this time)
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
"""
load_dotenv()
crawl_params = {
'crawlerOptions': {
#Exclude non-english paths, image resources, etc.
'excludes': [
'cs',
'da',
'de',
'es',
'fi',
'fr',
'he',
'hr',
'hu',
'it',
'ja',
'ko',
'nl',
'no',
'pl',
'pt',
'ru',
'sv',
'th',
'tr',
'zh-cn'
'.jpg',
'.png'
'.gif'
],
'includes': ['wiki/*'],
'limit': 75, #higher limit means more credits and more wait time.
}
}
# Crawl the KSP wiki starting from the main page. This is a live network
# crawl that consumes Firecrawl credits — run it sparingly.
loader = FireCrawlLoader(
    "https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
    mode="crawl",
    params=crawl_params,
)
docs = loader.load()
print("docs loaded")

# Chunk the crawled pages so each piece fits an embedding context window.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print("split complete")

# Embed each chunk with OpenAI and persist the vectors to a local Chroma DB.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb",
)
print("RAG database initialized with the following sources.")

# Report the distinct source URLs that made it into the store.
retriever = vectorstore.as_retriever()
unique_sources = {meta['sourceURL'] for meta in retriever.vectorstore.get()['metadatas']}
for source in unique_sources:
    print(f" {source}")