loader/search operational

David Westgate 2024-04-18 22:00:59 -07:00
parent 24c5e4401b
commit 1002443082
3 changed files with 108 additions and 92 deletions

View File

@@ -1,51 +1 @@
import os #todo
import sys
import time
import math
import numpy
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain import hub
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
MessagesPlaceholder,
HumanMessagePromptTemplate,
ChatPromptTemplate,
PromptTemplate,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import (
GoogleGenerativeAI,
GoogleGenerativeAIEmbeddings,
ChatGoogleGenerativeAI,
HarmCategory,
HarmBlockThreshold,
)
from langchain_community.document_loaders import AsyncHtmlLoader, RecursiveUrlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI
from langchain_core.messages import HumanMessage
load_dotenv()
llm = OpenAI()
chat_model = ChatOpenAI(model="gpt-4")
text = "What is a good question to put here?"
messages = [HumanMessage(content=text)]
llm.invoke(text)
# >> Feetful of Fun
chat_model.invoke(messages)
# >> AIMessage(content="Socks O'Color")

View File

@@ -1,69 +1,92 @@
-from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader
 from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from langchain_community.document_loaders import WebBaseLoader
-import bs4
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.document_loaders import FireCrawlLoader
+from dotenv import load_dotenv
 """
-Loader attempting to load documents for the game Kerbal Space program two, both from wikipedia, as well as details from
-the games own fan-run wiki, using GPT4
+Loader attempting to load documents for the game Kerbal Space Program 2 from
+the game's own fan-run wiki, using the Firecrawl loader and OpenAI embeddings.
+Firecrawl should crawl every link reachable from the main page (within the configured limits), making easy work of the document loading (which is why I chose it).
+I chose OpenAI since I already pay for it, so hopefully it's a bit better than the free alternatives.
+Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
 Code adapted from
 1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
 2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
-3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
+Firecrawl docs reference:
+https://github.com/mendableai/firecrawl-py
+This takes a while to crawl, so just run it once, and watch out for Firecrawl credit usage.
 """
-# vectorstore = Chroma(
-#     embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
-#     persist_directory="./rag_data/.chromadb"
-# )
-# Load Documents
-loader = WebBaseLoader(
-    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
-    bs_kwargs=dict(
-        parse_only=bs4.SoupStrainer(
-            class_=("post-content", "post-title", "post-header")
-        )
-    ),
-)
+load_dotenv()
+crawl_params = {
+    'crawlerOptions': {
+        # I want to exclude non-English paths, but this isn't working right yet. Needs work
+        'excludes': [
+            '*/cs',
+            '*/da',
+            '*/de',
+            '*/es',
+            '*/fi',
+            '*/fr',
+            '*/he',
+            '*/hr',
+            '*/hu',
+            '*/it',
+            '*/ja',
+            '*/ko',
+            '*/nl',
+            '*/no',
+            '*/pl',
+            '*/pt',
+            '*/ru',
+            '*/sv',
+            '*/th',
+            '*/tr',
+            '*/zh-cn'
+        ],
+        'includes': ['wiki/*'],
+        'limit': 20,  # A higher limit means more credits and more wait time.
+    }
+}
+loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page", mode="crawl", params=crawl_params)
 docs = loader.load()
+print("docs loaded")
 # Split
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
 splits = text_splitter.split_documents(docs)
+print("split complete")
 # Embed
 vectorstore = Chroma.from_documents(documents=splits,
-                                    embedding=OpenAIEmbeddings())
+                                    embedding=OpenAIEmbeddings(),
+                                    persist_directory="./rag_data/.chromadb")
-def load_docs(docs):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
-    splits = text_splitter.split_documents(docs)
-    vectorstore.add_documents(documents=splits)
-def load_wikipedia(query):
-    load_docs(WikipediaLoader(query=query, load_max_docs=1).load())
-def load_urls(urls):
-    load_docs(AsyncHtmlLoader(urls).load())
-wiki_query = "Kerbel Space Program"
-print(f"Loading Wikipedia pages on: {wiki_query}")
-load_wikipedia(wiki_query)
-urls = ["https://wiki.kerbalspaceprogram.com/wiki/Kerbin", "https://wiki.kerbalspaceprogram.com/wiki/Eve"]
-print(f"Loading: {urls}")
-load_urls(urls)
 print("RAG database initialized with the following sources.")
 retriever = vectorstore.as_retriever()
 document_data_sources = set()
 for doc_metadata in retriever.vectorstore.get()['metadatas']:
-    document_data_sources.add(doc_metadata['source'])
+    document_data_sources.add(doc_metadata['sourceURL'])
 for doc in document_data_sources:
     print(f" {doc}")

hw1/search.py Normal file
View File

@@ -0,0 +1,43 @@
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import readline  # imported for its side effect: line editing and history for input()
"""
A search utility for the loaded documents, for testing and debugging.
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG, with small changes for env loading and OpenAI embeddings.
"""
load_dotenv()
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./rag_data/.chromadb"
)
def search_db(query):
docs = vectorstore.similarity_search(query)
print(f"Query database for: {query}")
if docs:
print(f"Closest document match in database: {docs[0].metadata['sourceURL']}")
else:
print("No matching documents")
print("RAG database initialized.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
print(f"docm {doc_metadata}")
document_data_sources.add(doc_metadata['sourceURL'])
for doc in document_data_sources:
print(f" {doc}")
print("This program queries documents in the RAG database that are similar to whatever is entered.")
while True:
line = input(">> ")
if line:
search_db(line)
else:
break
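
If raw similarity scores would help when debugging retrieval quality, Chroma's similarity_search_with_score returns (document, score) pairs instead of bare documents. A small sketch along those lines, reusing the vectorstore above (the k value and the sourceURL fallback are assumptions):

# Optional variant of search_db: show the top-k matches with their distance scores
# (lower score = closer match).
def search_db_scored(query, k=3):
    for doc, score in vectorstore.similarity_search_with_score(query, k=k):
        print(f"{score:.4f}  {doc.metadata.get('sourceURL', '(unknown source)')}")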