from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv
"""
|
|
Loader attempting to load documents for the game Kerbal Space program 2 from
|
|
the games own fan-run wiki, using Firecrawl loader and GPT4 embedding.
|
|
|
|
Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that)
|
|
I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives
|
|
|
|
Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
|
|
|
|
Code adapted from
|
|
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
|
|
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
|
|
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
|
|
|
|
Firecrawl docs reference
|
|
https://github.com/mendableai/firecrawl-py
|
|
|
|
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
|
|
"""

load_dotenv()
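
# Not from the referenced labs: a fail-fast check that the keys the loaders
# expect are actually present in .env. Assumes the standard OPENAI_API_KEY /
# FIRECRAWL_API_KEY variable names mentioned in the docstring above.
import os

for key in ("OPENAI_API_KEY", "FIRECRAWL_API_KEY"):
    if not os.getenv(key):
        raise SystemExit(f"Missing {key}; add it to .env before running.")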

crawl_params = {
    'crawlerOptions': {
        # I want to exclude non-English paths, but this isn't working right
        # yet (needs work); as a fallback, the loaded docs are post-filtered
        # below.
        'excludes': [
            '*/cs',
            '*/da',
            '*/de',
            '*/es',
            '*/fi',
            '*/fr',
            '*/he',
            '*/hr',
            '*/hu',
            '*/it',
            '*/ja',
            '*/ko',
            '*/nl',
            '*/no',
            '*/pl',
            '*/pt',
            '*/ru',
            '*/sv',
            '*/th',
            '*/tr',
            '*/zh-cn',
        ],
        'includes': ['wiki/*'],
        'limit': 20,  # a higher limit means more credits and more wait time
    }
}

loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
                         mode="crawl", params=crawl_params)
docs = loader.load()
print("docs loaded")

# Split the pages into 1000-character chunks with 200 characters of overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")

# Embed the chunks and persist them to a local Chroma database
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings(),
                                    persist_directory="./rag_data/.chromadb")
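
# A later run could reopen the persisted store instead of re-crawling and
# re-embedding. A sketch, assuming the same directory and embedding model:
#
#     vectorstore = Chroma(persist_directory="./rag_data/.chromadb",
#                          embedding_function=OpenAIEmbeddings())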

print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    document_data_sources.add(doc_metadata['sourceURL'])
for doc in document_data_sources:
    print(f"  {doc}")