From 1002443082c5602b173e902a5786d06515c40f02 Mon Sep 17 00:00:00 2001 From: David Westgate Date: Thu, 18 Apr 2024 22:00:59 -0700 Subject: [PATCH] loader/search operational --- hw1/app.py | 52 +------------------------ hw1/loader.py | 105 ++++++++++++++++++++++++++++++-------------------- hw1/search.py | 43 +++++++++++++++++++++ 3 files changed, 108 insertions(+), 92 deletions(-) create mode 100644 hw1/search.py diff --git a/hw1/app.py b/hw1/app.py index 85c7702..8b68f79 100644 --- a/hw1/app.py +++ b/hw1/app.py @@ -1,51 +1 @@ -import os -import sys -import time -import math -import numpy -from dotenv import load_dotenv -from bs4 import BeautifulSoup -from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer -from sklearn.metrics.pairwise import cosine_similarity - -from langchain import hub -from langchain.chains import LLMChain -from langchain.memory import ConversationBufferMemory -from langchain.prompts import ( - MessagesPlaceholder, - HumanMessagePromptTemplate, - ChatPromptTemplate, - PromptTemplate, -) -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_core.messages import HumanMessage, SystemMessage -from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables import RunnablePassthrough -from langchain_google_genai import ( - GoogleGenerativeAI, - GoogleGenerativeAIEmbeddings, - ChatGoogleGenerativeAI, - HarmCategory, - HarmBlockThreshold, -) -from langchain_community.document_loaders import AsyncHtmlLoader, RecursiveUrlLoader -from langchain_community.document_transformers import BeautifulSoupTransformer -from langchain_community.vectorstores import Chroma - -from langchain_openai import ChatOpenAI -from langchain_openai import OpenAI - -from langchain_core.messages import HumanMessage - -load_dotenv() -llm = OpenAI() -chat_model = ChatOpenAI(model="gpt-4") - -text = "What is a good question to put here?" 
-messages = [HumanMessage(content=text)] - -llm.invoke(text) -# >> Feetful of Fun - -chat_model.invoke(messages) -# >> AIMessage(content="Socks O'Color") +#todo \ No newline at end of file diff --git a/hw1/loader.py b/hw1/loader.py index c503b2e..1bcd1c4 100644 --- a/hw1/loader.py +++ b/hw1/loader.py @@ -1,69 +1,92 @@ -from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader from langchain_community.vectorstores import Chroma from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_openai import ChatOpenAI, OpenAIEmbeddings -from langchain_community.document_loaders import WebBaseLoader -import bs4 +from langchain_openai import OpenAIEmbeddings +from langchain_community.document_loaders import FireCrawlLoader +from dotenv import load_dotenv """ -Loader attempting to load documents for the game Kerbal Space program two, both from wikipedia, as well as details from -the games own fan-run wiki, using GPT4 +Loader attempting to load documents for the game Kerbal Space Program 2 from +the game's own fan-run wiki, using Firecrawl loader and GPT4 embedding. + +Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that) +I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives + +Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env Code adapted from 1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb 2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG +3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/ + +Firecrawl docs reference +https://github.com/mendableai/firecrawl-py + +This takes a while to crawl, so just run it once and watch out for firecrawl credit usage. 
""" +load_dotenv() +crawl_params = { + 'crawlerOptions': { -# vectorstore = Chroma( -# embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"), -# persist_directory="./rag_data/.chromadb" -# ) - -# Load Documents -loader = WebBaseLoader( - web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",), - bs_kwargs=dict( - parse_only=bs4.SoupStrainer( - class_=("post-content", "post-title", "post-header") - ) - ), -) + #I want to exclude non-english paths, but this isn't working right yet. Needs work + 'excludes': [ + '*/cs', + '*/da', + '*/de', + '*/es', + '*/fi', + '*/fr', + '*/he', + '*/hr', + '*/hu', + '*/it', + '*/cs', + '*/da', + '*/de', + '*/es', + '*/fi', + '*/fr', + '*/he', + '*/hr', + '*/hu', + '*/it', + '*/ja', + '*/ko', + '*/nl', + '*/no', + '*/pl', + '*/pt', + '*/ru', + '*/sv', + '*/th', + '*/tr', + '*/zh-cn' + ], + 'includes': ['wiki/*'], + 'limit': 20, #higher limit means more credits and more wait time. + } +} +loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page", mode="crawl", params=crawl_params) docs = loader.load() +print("docs loaded") # Split text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) splits = text_splitter.split_documents(docs) +print("split complete") + # Embed vectorstore = Chroma.from_documents(documents=splits, - embedding=OpenAIEmbeddings()) + embedding=OpenAIEmbeddings(), + persist_directory="./rag_data/.chromadb") -def load_docs(docs): - text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10) - splits = text_splitter.split_documents(docs) - vectorstore.add_documents(documents=splits) - -def load_wikipedia(query): - load_docs(WikipediaLoader(query=query, load_max_docs=1).load()) - -def load_urls(urls): - load_docs(AsyncHtmlLoader(urls).load()) - - -wiki_query = "Kerbel Space Program" -print(f"Loading Wikipedia pages on: {wiki_query}") -load_wikipedia(wiki_query) - -urls = 
["https://wiki.kerbalspaceprogram.com/wiki/Kerbin", "https://wiki.kerbalspaceprogram.com/wiki/Eve"] -print(f"Loading: {urls}") -load_urls(urls) print("RAG database initialized with the following sources.") retriever = vectorstore.as_retriever() document_data_sources = set() for doc_metadata in retriever.vectorstore.get()['metadatas']: - document_data_sources.add(doc_metadata['source']) + document_data_sources.add(doc_metadata['sourceURL']) for doc in document_data_sources: print(f" {doc}") \ No newline at end of file diff --git a/hw1/search.py b/hw1/search.py new file mode 100644 index 0000000..2ef218f --- /dev/null +++ b/hw1/search.py @@ -0,0 +1,43 @@ +from langchain_community.vectorstores import Chroma +from langchain_google_genai import GoogleGenerativeAIEmbeddings +import readline +from langchain_community.vectorstores import Chroma +from langchain_openai import ChatOpenAI, OpenAIEmbeddings +from dotenv import load_dotenv + +""" +A search utility for the loaded documents, for testing and debugging +Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding +""" + + +load_dotenv() +vectorstore = Chroma( + embedding_function=OpenAIEmbeddings(), + persist_directory="./rag_data/.chromadb" +) + +def search_db(query): + docs = vectorstore.similarity_search(query) + print(f"Query database for: {query}") + if docs: + print(f"Closest document match in database: {docs[0].metadata['sourceURL']}") + else: + print("No matching documents") + +print("RAG database initialized.") +retriever = vectorstore.as_retriever() +document_data_sources = set() +for doc_metadata in retriever.vectorstore.get()['metadatas']: + print(f"docm {doc_metadata}") + document_data_sources.add(doc_metadata['sourceURL']) +for doc in document_data_sources: + print(f" {doc}") + +print("This program queries documents in the RAG database that are similar to whatever is entered.") +while True: + line = input(">> ") + if line: + 
search_db(line) + else: + break