loader/search operational

This commit is contained in:
parent 24c5e4401b
commit 1002443082
hw1/app.py (52 lines changed)

@@ -1,51 +1,1 @@
-import os
-import sys
-import time
-import math
-import numpy
-from dotenv import load_dotenv
-from bs4 import BeautifulSoup
-from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
-from sklearn.metrics.pairwise import cosine_similarity
-
-from langchain import hub
-from langchain.chains import LLMChain
-from langchain.memory import ConversationBufferMemory
-from langchain.prompts import (
-    MessagesPlaceholder,
-    HumanMessagePromptTemplate,
-    ChatPromptTemplate,
-    PromptTemplate,
-)
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_core.messages import HumanMessage, SystemMessage
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnablePassthrough
-from langchain_google_genai import (
-    GoogleGenerativeAI,
-    GoogleGenerativeAIEmbeddings,
-    ChatGoogleGenerativeAI,
-    HarmCategory,
-    HarmBlockThreshold,
-)
-from langchain_community.document_loaders import AsyncHtmlLoader, RecursiveUrlLoader
-from langchain_community.document_transformers import BeautifulSoupTransformer
-from langchain_community.vectorstores import Chroma
-
-from langchain_openai import ChatOpenAI
-from langchain_openai import OpenAI
-
-from langchain_core.messages import HumanMessage
-
-load_dotenv()
-llm = OpenAI()
-chat_model = ChatOpenAI(model="gpt-4")
-
-text = "What is a good question to put here?"
-messages = [HumanMessage(content=text)]
-
-llm.invoke(text)
-# >> Feetful of Fun
-
-chat_model.invoke(messages)
-# >> AIMessage(content="Socks O'Color")
+#todo
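For reference, the 51 lines deleted above were the LangChain quickstart demo contrasting a completion-style LLM (string in, string out) with a chat model (message list in, AIMessage out). A minimal sketch of that pattern, assuming OPENAI_API_KEY is defined in .env:

```python
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI, OpenAI

load_dotenv()  # assumes OPENAI_API_KEY is defined in .env

llm = OpenAI()                          # completion-style model: str -> str
chat_model = ChatOpenAI(model="gpt-4")  # chat model: messages -> AIMessage

text = "What is a good question to put here?"
print(llm.invoke(text))                                 # plain string back
print(chat_model.invoke([HumanMessage(content=text)]))  # AIMessage(content=...)
```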
hw1/loader.py (105 lines changed)

@@ -1,69 +1,92 @@
-from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader
 from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from langchain_community.document_loaders import WebBaseLoader
-import bs4
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.document_loaders import FireCrawlLoader
+from dotenv import load_dotenv
 
 """
-Loader attempting to load documents for the game Kerbal Space program two, both from wikipedia, as well as details from
-the games own fan-run wiki, using GPT4
+Loader attempting to load documents for the game Kerbal Space Program 2 from
+the game's own fan-run wiki, using the Firecrawl loader and OpenAI embeddings.
+
+Firecrawl should crawl all links from the main page (with configured limits), making easy work of the document loading (which is why I chose it).
+I chose OpenAI/GPT-4 since I already pay for it, so hopefully it's a bit better than the free alternatives.
+
+Requires OPENAI_API_KEY and FIRECRAWL_API_KEY in .env
 
 Code adapted from
 1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
 2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
+3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
+
+Firecrawl docs reference:
+https://github.com/mendableai/firecrawl-py
+
+This takes a while to crawl, so just run it once, and watch out for Firecrawl credit usage.
 """
 
-# vectorstore = Chroma(
-#     embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
-#     persist_directory="./rag_data/.chromadb"
-# )
-
-# Load Documents
-loader = WebBaseLoader(
-    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
-    bs_kwargs=dict(
-        parse_only=bs4.SoupStrainer(
-            class_=("post-content", "post-title", "post-header")
-        )
-    ),
-)
+load_dotenv()
+
+crawl_params = {
+    'crawlerOptions': {
+        # I want to exclude non-English paths, but this isn't working right yet. Needs work.
+        'excludes': [
+            '*/cs',
+            '*/da',
+            '*/de',
+            '*/es',
+            '*/fi',
+            '*/fr',
+            '*/he',
+            '*/hr',
+            '*/hu',
+            '*/it',
+            '*/cs',
+            '*/da',
+            '*/de',
+            '*/es',
+            '*/fi',
+            '*/fr',
+            '*/he',
+            '*/hr',
+            '*/hu',
+            '*/it',
+            '*/ja',
+            '*/ko',
+            '*/nl',
+            '*/no',
+            '*/pl',
+            '*/pt',
+            '*/ru',
+            '*/sv',
+            '*/th',
+            '*/tr',
+            '*/zh-cn'
+        ],
+        'includes': ['wiki/*'],
+        'limit': 20,  # A higher limit means more credits and more wait time.
+    }
+}
 
+loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page", mode="crawl", params=crawl_params)
 docs = loader.load()
+print("docs loaded")
 
 # Split
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
 splits = text_splitter.split_documents(docs)
+
+print("split complete")
 
 # Embed
 vectorstore = Chroma.from_documents(documents=splits,
-                                    embedding=OpenAIEmbeddings())
+                                    embedding=OpenAIEmbeddings(),
+                                    persist_directory="./rag_data/.chromadb")
 
-def load_docs(docs):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
-    splits = text_splitter.split_documents(docs)
-    vectorstore.add_documents(documents=splits)
-
-def load_wikipedia(query):
-    load_docs(WikipediaLoader(query=query, load_max_docs=1).load())
-
-def load_urls(urls):
-    load_docs(AsyncHtmlLoader(urls).load())
-
-wiki_query = "Kerbel Space Program"
-print(f"Loading Wikipedia pages on: {wiki_query}")
-load_wikipedia(wiki_query)
-
-urls = ["https://wiki.kerbalspaceprogram.com/wiki/Kerbin", "https://wiki.kerbalspaceprogram.com/wiki/Eve"]
-print(f"Loading: {urls}")
-load_urls(urls)
-
 print("RAG database initialized with the following sources.")
 retriever = vectorstore.as_retriever()
 document_data_sources = set()
 for doc_metadata in retriever.vectorstore.get()['metadatas']:
-    document_data_sources.add(doc_metadata['source'])
+    document_data_sources.add(doc_metadata['sourceURL'])
 for doc in document_data_sources:
     print(f"  {doc}")
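The loader's docstring warns that the crawl is slow and spends Firecrawl credits, so it should only run once. One way to make the script safe to re-run is to reopen the persisted Chroma store when `./rag_data/.chromadb` already exists and crawl only on the first run. A minimal sketch of that guard, using only the calls already present in this commit (the `PERSIST_DIR` constant and the `os.path.isdir` check are additions, not part of the commit):

```python
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import FireCrawlLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

load_dotenv()  # expects OPENAI_API_KEY and FIRECRAWL_API_KEY, as in loader.py

PERSIST_DIR = "./rag_data/.chromadb"  # same directory loader.py persists to

if os.path.isdir(PERSIST_DIR):
    # The store already exists on disk: reopen it instead of re-crawling.
    vectorstore = Chroma(embedding_function=OpenAIEmbeddings(),
                         persist_directory=PERSIST_DIR)
else:
    # First run: crawl the wiki, split, embed, and persist.
    loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page",
                             mode="crawl")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(loader.load())
    vectorstore = Chroma.from_documents(documents=splits,
                                        embedding=OpenAIEmbeddings(),
                                        persist_directory=PERSIST_DIR)
```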
hw1/search.py (43 lines, new file)

@@ -0,0 +1,43 @@
+from langchain_community.vectorstores import Chroma
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+import readline
+from langchain_community.vectorstores import Chroma
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from dotenv import load_dotenv
+
+"""
+A search utility for the loaded documents, for testing and debugging.
+Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG with small changes for env loading and OpenAI embeddings.
+"""
+
+load_dotenv()
+vectorstore = Chroma(
+    embedding_function=OpenAIEmbeddings(),
+    persist_directory="./rag_data/.chromadb"
+)
+
+def search_db(query):
+    docs = vectorstore.similarity_search(query)
+    print(f"Query database for: {query}")
+    if docs:
+        print(f"Closest document match in database: {docs[0].metadata['sourceURL']}")
+    else:
+        print("No matching documents")
+
+print("RAG database initialized.")
+retriever = vectorstore.as_retriever()
+document_data_sources = set()
+for doc_metadata in retriever.vectorstore.get()['metadatas']:
+    print(f"docm {doc_metadata}")
+    document_data_sources.add(doc_metadata['sourceURL'])
+for doc in document_data_sources:
+    print(f"  {doc}")
+
+print("This program queries documents in the RAG database that are similar to whatever is entered.")
+while True:
+    line = input(">> ")
+    if line:
+        search_db(line)
+    else:
+        break
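One caveat with search.py: it assumes every stored document carries Firecrawl's 'sourceURL' metadata key. Documents ingested by other loaders (such as the WikipediaLoader and AsyncHtmlLoader calls this commit removes from loader.py) store 'source' instead, so a database mixing both would raise a KeyError. A defensive lookup, as a hypothetical drop-in helper:

```python
def source_of(metadata: dict) -> str:
    # Firecrawl-loaded documents carry 'sourceURL'; most other LangChain
    # loaders (Wikipedia, AsyncHtml, WebBase, ...) use 'source' instead.
    return metadata.get('sourceURL') or metadata.get('source', 'unknown')

# Possible replacements inside search.py:
#   document_data_sources.add(source_of(doc_metadata))
#   print(f"Closest document match in database: {source_of(docs[0].metadata)}")
```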