This repository has been archived on 2025-04-28. You can view files and clone it, but cannot push or open issues or pull requests.
gensec-westgate-djw2/hw1/search.py

45 lines
1.3 KiB
Python

from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
"""
A search utility for querying the loaded documents, intended for testing and debugging.
Adapted from https://github.com/wu4f/cs410g-src/blob/main/03_RAG/07_rag_docsearch.py,
with small changes to load environment variables via dotenv and to use OpenAI embeddings.
"""
# Load environment variables via python-dotenv before any client is built
# (presumably this is where the OpenAI credentials come from — confirm).
load_dotenv()

# Module-level handle on the Chroma store persisted under ./rag_data/.chromadb,
# using OpenAI embeddings as its embedding function.
vectorstore = Chroma(
    persist_directory="./rag_data/.chromadb",
    embedding_function=OpenAIEmbeddings(),
)
def search_db(query):
    """Run a similarity search for *query* and print the top result's source.

    The module-level ``vectorstore`` is searched, the query is echoed, and
    then either the ``sourceURL`` metadata entry of the first returned
    document or a "no match" notice is printed. Nothing is returned.

    Args:
        query: Text to search the vector store for.
    """
    matches = vectorstore.similarity_search(query)
    print(f"Query database for: {query}")

    # Guard clause: nothing came back from the store.
    if not matches:
        print("No matching documents")
        return

    # The first result is reported as the closest match; its metadata is
    # expected to carry a 'sourceURL' entry.
    best_match = matches[0]
    print(f"Closest document match in database: {best_match.metadata['sourceURL']}")
print("RAG database initialized.")
retriever = vectorstore.as_retriever()

# Collect the distinct source URLs recorded in the stored documents' metadata.
document_data_sources = set()
for doc_metadata in retriever.vectorstore.get()["metadatas"]:
    print(f"docm {doc_metadata}")  # debug dump of each raw metadata record
    document_data_sources.add(doc_metadata["sourceURL"])

# Sorted so the listing is stable from run to run (set iteration order is
# arbitrary); key=str keeps the sort safe even if a value is not a string.
for doc in sorted(document_data_sources, key=str):
    print(f" {doc}")

print(
    "This program queries documents in the RAG database that are similar to whatever is entered."
)

# Interactive loop: an empty line, end-of-input (Ctrl-D / closed stdin) or
# Ctrl-C at the prompt ends the session.
while True:
    try:
        line = input(">> ")
    except (EOFError, KeyboardInterrupt):
        # input() raises rather than returning "" in these cases; finish the
        # prompt line and leave quietly instead of dying with a traceback.
        print()
        break
    if not line:
        break
    search_db(line)