From 24c5e4401b290924e1e5a16e1aecc0bffba6c42b Mon Sep 17 00:00:00 2001
From: David Westgate
Date: Thu, 18 Apr 2024 13:46:49 -0700
Subject: [PATCH] started work on the loader

---
 hw1/app.py    | 51 +++++++++++++++++++++++++++++++++++++
 hw1/loader.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 hw1/loader.py

diff --git a/hw1/app.py b/hw1/app.py
index e69de29..85c7702 100644
--- a/hw1/app.py
+++ b/hw1/app.py
@@ -0,0 +1,51 @@
+import os
+import sys
+import time
+import math
+import numpy
+from dotenv import load_dotenv
+from bs4 import BeautifulSoup
+from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+from langchain import hub
+from langchain.chains import LLMChain
+from langchain.memory import ConversationBufferMemory
+from langchain.prompts import (
+    MessagesPlaceholder,
+    HumanMessagePromptTemplate,
+    ChatPromptTemplate,
+    PromptTemplate,
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from langchain_google_genai import (
+    GoogleGenerativeAI,
+    GoogleGenerativeAIEmbeddings,
+    ChatGoogleGenerativeAI,
+    HarmCategory,
+    HarmBlockThreshold,
+)
+from langchain_community.document_loaders import AsyncHtmlLoader, RecursiveUrlLoader
+from langchain_community.document_transformers import BeautifulSoupTransformer
+from langchain_community.vectorstores import Chroma
+
+from langchain_openai import ChatOpenAI
+from langchain_openai import OpenAI
+
+from langchain_core.messages import HumanMessage
+
+load_dotenv()
+llm = OpenAI()
+chat_model = ChatOpenAI(model="gpt-4")
+
+text = "What is a good question to put here?"
+messages = [HumanMessage(content=text)]
+
+llm.invoke(text)
+# >> Feetful of Fun
+
+chat_model.invoke(messages)
+# >> AIMessage(content="Socks O'Color")
diff --git a/hw1/loader.py b/hw1/loader.py
new file mode 100644
index 0000000..c503b2e
--- /dev/null
+++ b/hw1/loader.py
@@ -0,0 +1,69 @@
+from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader
+from langchain_community.vectorstores import Chroma
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_community.document_loaders import WebBaseLoader
+import bs4
+
+"""
+Loader that attempts to load documents about the game Kerbal Space Program 2, both from Wikipedia and from
+the game's own fan-run wiki, for use with GPT-4.
+
+Code adapted from
+1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
+2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
+"""
+
+
+# vectorstore = Chroma(
+#     embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
+#     persist_directory="./rag_data/.chromadb"
+# )
+
+# Load Documents
+loader = WebBaseLoader(
+    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
+    bs_kwargs=dict(
+        parse_only=bs4.SoupStrainer(
+            class_=("post-content", "post-title", "post-header")
+        )
+    ),
+)
+
+docs = loader.load()
+
+# Split
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+splits = text_splitter.split_documents(docs)
+
+# Embed
+vectorstore = Chroma.from_documents(documents=splits,
+                                    embedding=OpenAIEmbeddings())
+
+def load_docs(docs):
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
+    splits = text_splitter.split_documents(docs)
+    vectorstore.add_documents(documents=splits)
+
+def load_wikipedia(query):
+    load_docs(WikipediaLoader(query=query, load_max_docs=1).load())
+
+def load_urls(urls):
+    load_docs(AsyncHtmlLoader(urls).load())
+
+
+wiki_query = "Kerbal Space Program"
+print(f"Loading Wikipedia pages on: {wiki_query}")
+load_wikipedia(wiki_query)
+
+urls = ["https://wiki.kerbalspaceprogram.com/wiki/Kerbin", "https://wiki.kerbalspaceprogram.com/wiki/Eve"]
+print(f"Loading: {urls}")
+load_urls(urls)
+
+print("RAG database initialized with the following sources.")
+retriever = vectorstore.as_retriever()
+document_data_sources = set()
+for doc_metadata in retriever.vectorstore.get()['metadatas']:
+    document_data_sources.add(doc_metadata['source'])
+for doc in document_data_sources:
+    print(f"  {doc}")
\ No newline at end of file