"""Populate a Chroma vector store with documents about Kerbal Space Program.

Loader attempting to load documents for the game Kerbal Space Program 2,
both from Wikipedia and from the game's own fan-run wiki, using GPT4.

Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
"""

import bs4
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    ArxivLoader,
    AsyncHtmlLoader,
    CSVLoader,
    DirectoryLoader,
    Docx2txtLoader,
    PyPDFDirectoryLoader,
    TextLoader,
    UnstructuredMarkdownLoader,
    WebBaseLoader,
    WikipediaLoader,
)
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Alternative (disabled): a persisted Chroma store using Google embeddings.
# vectorstore = Chroma(
#     embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
#     persist_directory="./rag_data/.chromadb"
# )

# Load Documents
# Only the post title, header and content elements of the page are parsed.
# NOTE(review): this seed page is not about Kerbal Space Program; it appears
# to be carried over from the adapted example -- confirm it should stay in
# the store.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
# The store is created from the seed chunks; the helpers below add to it.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())


def load_docs(docs) -> None:
    """Split *docs* into chunks and add them to the module-level vector store.

    Args:
        docs: Documents as returned by a loader's ``load()`` call.
    """
    # NOTE(review): chunk size/overlap (10000/10) differ from the seed split
    # above (1000/200) -- confirm this is intentional.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
    splits = text_splitter.split_documents(docs)
    vectorstore.add_documents(documents=splits)


def load_wikipedia(query: str) -> None:
    """Load at most one Wikipedia document for *query* into the vector store.

    Args:
        query: Search text passed to ``WikipediaLoader``.
    """
    load_docs(WikipediaLoader(query=query, load_max_docs=1).load())


def load_urls(urls) -> None:
    """Fetch the pages at *urls* and add them to the vector store.

    Args:
        urls: Page URLs passed to ``AsyncHtmlLoader``.
    """
    load_docs(AsyncHtmlLoader(urls).load())


# Fixed spelling: the game is "Kerbal", not "Kerbel", Space Program.
wiki_query = "Kerbal Space Program"
print(f"Loading Wikipedia pages on: {wiki_query}")
load_wikipedia(wiki_query)

# Pages from the fan-run wiki to add alongside the Wikipedia article.
urls = ["https://wiki.kerbalspaceprogram.com/wiki/Kerbin", "https://wiki.kerbalspaceprogram.com/wiki/Eve"]
# Fetch the listed pages and add them to the vector store.
print(f"Loading: {urls}")
load_urls(urls)

print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()

# Gather the distinct 'source' metadata values of everything in the store,
# then print each one once.
stored_metadatas = retriever.vectorstore.get()['metadatas']
document_data_sources = {metadata['source'] for metadata in stored_metadatas}

for source in document_data_sources:
    print(f" {source}")