loader/search operational

This commit is contained in:
David Westgate 2024-04-18 22:00:59 -07:00
parent 24c5e4401b
commit 1002443082
3 changed files with 108 additions and 92 deletions

View File

@ -1,51 +1 @@
import os
import sys
import time
import math
import numpy
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain import hub
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
MessagesPlaceholder,
HumanMessagePromptTemplate,
ChatPromptTemplate,
PromptTemplate,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import (
GoogleGenerativeAI,
GoogleGenerativeAIEmbeddings,
ChatGoogleGenerativeAI,
HarmCategory,
HarmBlockThreshold,
)
from langchain_community.document_loaders import AsyncHtmlLoader, RecursiveUrlLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAI
from langchain_core.messages import HumanMessage
# Demo: send the same prompt once through the completion-style wrapper and once
# through the chat wrapper. Both return values are discarded.
load_dotenv()

# Build the prompt and its chat-message form up front.
text = "What is a good question to put here?"
messages = [HumanMessage(content=text)]

# Instantiate both clients before making any call.
llm = OpenAI()
chat_model = ChatOpenAI(model="gpt-4")

# Plain-string prompt to the completion model.
llm.invoke(text)

# List of messages to the chat model.
chat_model.invoke(messages)
#todo

View File

@ -1,69 +1,92 @@
from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
import bs4
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from dotenv import load_dotenv
"""
Loader attempting to load documents for the game Kerbal Space Program 2, both from Wikipedia and from
the game's own fan-run wiki, using GPT-4
Loader attempting to load documents for the game Kerbal Space Program 2 from
the game's own fan-run wiki, using the Firecrawl loader and OpenAI embeddings.
Firecrawl should crawl to all links from the main page (with configured limits), making easy work of the document loader (why I chose that)
I chose OpenAI/GPT4 since I already pay for that so hopefully it's a bit better than free alternatives
Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
Code adapted from
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
Firecrawl docs reference
https://github.com/mendableai/firecrawl-py
This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
"""
# Load environment variables from .env before any client is constructed.
load_dotenv()
# Crawl options; this dict is passed to FireCrawlLoader through its `params` argument.
# NOTE(review): as rendered here, the commented-out Chroma(...) call and the
# WebBaseLoader(...) call below sit inside this dict literal. They look like lines
# from the previous revision interleaved by the diff view — confirm against the real file.
crawl_params = {
'crawlerOptions': {
# vectorstore = Chroma(
# embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
# persist_directory="./rag_data/.chromadb"
# )
# Load Documents
loader = WebBaseLoader(
web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
bs_kwargs=dict(
parse_only=bs4.SoupStrainer(
class_=("post-content", "post-title", "post-header")
)
),
)
# TODO: exclude non-English paths; these patterns are not working correctly yet.
# NOTE(review): '*/cs' through '*/it' appear twice in this list as shown — confirm
# whether the duplication is in the file or only an artifact of the diff view.
'excludes': [
'*/cs',
'*/da',
'*/de',
'*/es',
'*/fi',
'*/fr',
'*/he',
'*/hr',
'*/hu',
'*/it',
'*/cs',
'*/da',
'*/de',
'*/es',
'*/fi',
'*/fr',
'*/he',
'*/hr',
'*/hu',
'*/it',
'*/ja',
'*/ko',
'*/nl',
'*/no',
'*/pl',
'*/pt',
'*/ru',
'*/sv',
'*/th',
'*/tr',
'*/zh-cn'
],
'includes': ['wiki/*'],
'limit': 20, #higher limit means more credits and more wait time.
}
}
# Crawl the wiki starting at its main page, using the options in crawl_params.
loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page", mode="crawl", params=crawl_params)
docs = loader.load()
print("docs loaded")
# Split
# The crawled documents are cut into chunks (chunk_size=1000, chunk_overlap=200) before embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print("split complete")
# Embed
# NOTE(review): two `embedding=` lines follow — one that closes the call and one that
# continues with persist_directory. This looks like the old and new revisions interleaved
# by the diff view; confirm the file contains only the persisted variant.
vectorstore = Chroma.from_documents(documents=splits,
embedding=OpenAIEmbeddings())
embedding=OpenAIEmbeddings(),
persist_directory="./rag_data/.chromadb")
def load_docs(docs):
    """Chunk *docs* and add the resulting chunks to the module-level ``vectorstore``.

    docs: documents in the form accepted by
        ``RecursiveCharacterTextSplitter.split_documents``.

    Returns None; the only effect is the call to ``vectorstore.add_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
    chunks = splitter.split_documents(docs)
    vectorstore.add_documents(documents=chunks)
def load_wikipedia(query):
    """Load Wikipedia content for *query* (``load_max_docs=1``) and pass it to ``load_docs``."""
    wikipedia_loader = WikipediaLoader(query=query, load_max_docs=1)
    pages = wikipedia_loader.load()
    load_docs(pages)
def load_urls(urls):
    """Load the pages at *urls* with ``AsyncHtmlLoader`` and pass them to ``load_docs``."""
    html_loader = AsyncHtmlLoader(urls)
    pages = html_loader.load()
    load_docs(pages)
# Seed the store: one Wikipedia lookup followed by two pages from the KSP wiki.
# The query text is sent to Wikipedia as-is, so the game's name must be spelled
# correctly ("Kerbal", not "Kerbel") for the lookup to target the right article.
wiki_query = "Kerbal Space Program"
print(f"Loading Wikipedia pages on: {wiki_query}")
load_wikipedia(wiki_query)

# Individual wiki pages loaded directly by URL.
urls = [
    "https://wiki.kerbalspaceprogram.com/wiki/Kerbin",
    "https://wiki.kerbalspaceprogram.com/wiki/Eve",
]
print(f"Loading: {urls}")
load_urls(urls)
# Report which sources ended up in the vector store.
print("RAG database initialized with the following sources.")
retriever = vectorstore.as_retriever()
# A set is used so that each source is printed only once.
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
# NOTE(review): both a 'source' and a 'sourceURL' lookup appear below. This looks like
# the old and new revisions interleaved by the diff view — confirm which key the stored
# metadata actually carries; whichever key is absent would raise KeyError.
document_data_sources.add(doc_metadata['source'])
document_data_sources.add(doc_metadata['sourceURL'])
for doc in document_data_sources:
print(f" {doc}")

43
hw1/search.py Normal file
View File

@ -0,0 +1,43 @@
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import readline
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from dotenv import load_dotenv
"""
A search utility for the loaded documents, for testing and debugging
Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small change for env loading and OpenAI embedding
"""
# Load environment variables from .env before the embedding client is created.
load_dotenv()
# Vector store backed by the ./rag_data/.chromadb directory, using OpenAI embeddings
# for queries. Shared by search_db() and the listing code below.
vectorstore = Chroma(
embedding_function=OpenAIEmbeddings(),
persist_directory="./rag_data/.chromadb"
)
def search_db(query):
    """Print the source of the first document returned for *query*.

    Runs ``similarity_search`` on the module-level ``vectorstore`` and reports
    the first result's source, or a "no match" message when the search
    returns nothing.

    query: text to search the vector store for.

    Returns None; all output goes to stdout.
    """
    docs = vectorstore.similarity_search(query)
    print(f"Query database for: {query}")
    if not docs:
        print("No matching documents")
        return

    metadata = docs[0].metadata
    # Stored documents are not guaranteed to carry 'sourceURL' (other loaders
    # record 'source' instead), so fall back rather than raise KeyError.
    source = metadata.get('sourceURL') or metadata.get('source') or "<unknown source>"
    print(f"Closest document match in database: {source}")
# List every distinct source currently stored, then start the interactive prompt.
print("RAG database initialized.")
retriever = vectorstore.as_retriever()
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()['metadatas']:
    # Debug dump of the raw metadata for each stored chunk.
    print(f"docm {doc_metadata}")
    # Prefer 'sourceURL' but accept 'source'; skip entries that have neither
    # instead of aborting the whole listing with a KeyError.
    source = doc_metadata.get('sourceURL') or doc_metadata.get('source')
    if source is not None:
        document_data_sources.add(source)
for doc in document_data_sources:
    print(f" {doc}")

print("This program queries documents in the RAG database that are similar to whatever is entered.")
# Read queries until an empty line is entered or input ends.
while True:
    try:
        line = input(">> ")
    except EOFError:
        # End of input (Ctrl-D or exhausted piped stdin): leave quietly
        # instead of dying with a traceback.
        print()
        break
    if not line:
        break
    search_db(line)