This repository has been archived on 2025-04-28. You can view files and clone it, but cannot push or open issues or pull requests.
gensec-westgate-djw2/hw1/loader.py
2024-04-18 13:46:49 -07:00

69 lines
2.4 KiB
Python

from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
import bs4
"""
Loader that attempts to load documents for the game Kerbal Space Program 2, both from Wikipedia and from
the game's own fan-run wiki, using GPT-4.
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
"""
# vectorstore = Chroma(
# embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
# persist_directory="./rag_data/.chromadb"
# )
# Load Documents
# Only elements carrying one of these CSS classes are parsed out of the page.
_post_strainer = bs4.SoupStrainer(
    class_=("post-content", "post-title", "post-header")
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": _post_strainer},
)
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed
# The store created here is the module-level `vectorstore` that the loader
# helpers below add further documents to.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)
def load_docs(docs, chunk_size=10000, chunk_overlap=10):
    """Split *docs* into chunks and add the chunks to the module-level vector store.

    Args:
        docs: Documents to split, as returned by a loader's ``load()`` call.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to the value this script has always used.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to the value this script has always used.

    Returns:
        None. When there is nothing to add, the vector store is left untouched.
    """
    # A loader may come back empty (e.g. a query with no hits); there is
    # nothing to split or embed in that case.
    if not docs:
        return
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(docs)
    # Skip the store call when splitting produced no chunks.
    if not splits:
        return
    vectorstore.add_documents(documents=splits)
def load_wikipedia(query):
    """Load Wikipedia content for *query* and hand it to ``load_docs``.

    The loader is created with ``load_max_docs=1``.
    """
    wiki_loader = WikipediaLoader(query=query, load_max_docs=1)
    wiki_docs = wiki_loader.load()
    load_docs(wiki_docs)
def load_urls(urls):
    """Load the pages at *urls* with ``AsyncHtmlLoader`` and hand them to ``load_docs``."""
    html_loader = AsyncHtmlLoader(urls)
    pages = html_loader.load()
    load_docs(pages)
# Spelling fixed: the game's name is "Kerbal" (as in the wiki URLs below).
wiki_query = "Kerbal Space Program"
print(f"Loading Wikipedia pages on: {wiki_query}")
load_wikipedia(wiki_query)

urls = [
    "https://wiki.kerbalspaceprogram.com/wiki/Kerbin",
    "https://wiki.kerbalspaceprogram.com/wiki/Eve",
]
print(f"Loading: {urls}")
load_urls(urls)

print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
# Collect the distinct 'source' values from the stored documents' metadata,
# ignoring any entry that has no metadata or no 'source' key.
document_data_sources = {
    doc_metadata["source"]
    for doc_metadata in retriever.vectorstore.get()["metadatas"]
    if doc_metadata and "source" in doc_metadata
}
# Sorted so the listing is the same from run to run (set order is arbitrary).
for source in sorted(document_data_sources):
    print(f" {source}")