From 1002443082c5602b173e902a5786d06515c40f02 Mon Sep 17 00:00:00 2001
From: David Westgate <davidjwestgate@gmail.com>
Date: Thu, 18 Apr 2024 22:00:59 -0700
Subject: [PATCH] loader/search operational

---
 hw1/app.py    |  52 +------------------------
 hw1/loader.py | 105 ++++++++++++++++++++++++++++++--------------------
 hw1/search.py |  43 +++++++++++++++++++++
 3 files changed, 108 insertions(+), 92 deletions(-)
 create mode 100644 hw1/search.py

diff --git a/hw1/app.py b/hw1/app.py
index 85c7702..8b68f79 100644
--- a/hw1/app.py
+++ b/hw1/app.py
@@ -1,51 +1 @@
-import os
-import sys
-import time
-import math
-import numpy
-from dotenv import load_dotenv
-from bs4 import BeautifulSoup
-from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
-from sklearn.metrics.pairwise import cosine_similarity
-
-from langchain import hub
-from langchain.chains import LLMChain
-from langchain.memory import ConversationBufferMemory
-from langchain.prompts import (
-    MessagesPlaceholder,
-    HumanMessagePromptTemplate,
-    ChatPromptTemplate,
-    PromptTemplate,
-)
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_core.messages import HumanMessage, SystemMessage
-from langchain_core.output_parsers import StrOutputParser
-from langchain_core.runnables import RunnablePassthrough
-from langchain_google_genai import (
-    GoogleGenerativeAI,
-    GoogleGenerativeAIEmbeddings,
-    ChatGoogleGenerativeAI,
-    HarmCategory,
-    HarmBlockThreshold,
-)
-from langchain_community.document_loaders import AsyncHtmlLoader, RecursiveUrlLoader
-from langchain_community.document_transformers import BeautifulSoupTransformer
-from langchain_community.vectorstores import Chroma
-
-from langchain_openai import ChatOpenAI
-from langchain_openai import OpenAI
-
-from langchain_core.messages import HumanMessage
-
-load_dotenv()
-llm = OpenAI()
-chat_model = ChatOpenAI(model="gpt-4")
-
-text = "What is a good question to put here?"
-messages = [HumanMessage(content=text)]
-
-llm.invoke(text)
-# >> Feetful of Fun
-
-chat_model.invoke(messages)
-# >> AIMessage(content="Socks O'Color")
+#todo
\ No newline at end of file
diff --git a/hw1/loader.py b/hw1/loader.py
index c503b2e..1bcd1c4 100644
--- a/hw1/loader.py
+++ b/hw1/loader.py
@@ -1,69 +1,92 @@
-from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader
 from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-from langchain_community.document_loaders import WebBaseLoader
-import bs4
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.document_loaders import FireCrawlLoader
+from dotenv import load_dotenv
 
 """
-Loader attempting to load documents for the game Kerbal Space program two, both from wikipedia, as well as details from 
-the games own fan-run wiki, using GPT4
+Loader that loads documents for the game Kerbal Space Program 2 from
+the game's own fan-run wiki, using the Firecrawl loader and OpenAI embeddings.
+
+Firecrawl should crawl all links reachable from the main page (within the configured limits), which makes document loading easy; that is why I chose it.
+I chose OpenAI/GPT4 because I already pay for it, so hopefully it is a bit better than the free alternatives.
+
+Requires OPENAI_API_KEY and FIREWALL_API_KEY in .env
 
 Code adapted from 
 1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
 2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
+3) https://python.langchain.com/docs/modules/data_connection/document_loaders/html/
+
+Firecrawl docs reference
+https://github.com/mendableai/firecrawl-py
+
+This takes a while to crawl, so just run it once and watch out for firecrawl credit usage.
 """
 
+load_dotenv()
+crawl_params = {
+    'crawlerOptions': {
 
-# vectorstore = Chroma(
-#     embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
-#     persist_directory="./rag_data/.chromadb"
-# )
-
-# Load Documents
-loader = WebBaseLoader(
-    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
-    bs_kwargs=dict(
-        parse_only=bs4.SoupStrainer(
-            class_=("post-content", "post-title", "post-header")
-        )
-    ),
-)
+        # TODO: exclude non-English paths. These patterns are not working correctly yet and need more work.
+        'excludes': [
+                    '*/cs',
+                    '*/da',
+                    '*/de',
+                    '*/es',
+                    '*/fi',
+                    '*/fr',
+                    '*/he',
+                    '*/hr',
+                    '*/hu',
+                    '*/it',
+                    '*/cs',
+                    '*/da',
+                    '*/de',
+                    '*/es',
+                    '*/fi',
+                    '*/fr',
+                    '*/he',
+                    '*/hr',
+                    '*/hu',
+                    '*/it',
+                    '*/ja',
+                    '*/ko',
+                    '*/nl',
+                    '*/no',
+                    '*/pl',
+                    '*/pt',
+                    '*/ru',
+                    '*/sv',
+                    '*/th',
+                    '*/tr',
+                    '*/zh-cn'
+        ],
+        'includes': ['wiki/*'],
+        'limit': 20, #higher limit means more credits and more wait time. 
+    }
+}
 
+loader = FireCrawlLoader("https://wiki.kerbalspaceprogram.com/wiki/Main_Page",  mode="crawl", params=crawl_params)
 docs = loader.load()
+print("docs loaded")
 
 # Split
 text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
 splits = text_splitter.split_documents(docs)
 
+print("split complete")
+
 # Embed
 vectorstore = Chroma.from_documents(documents=splits, 
-                                    embedding=OpenAIEmbeddings())
+                                    embedding=OpenAIEmbeddings(),
+                                    persist_directory="./rag_data/.chromadb")
 
-def load_docs(docs):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
-    splits = text_splitter.split_documents(docs)
-    vectorstore.add_documents(documents=splits)
-
-def load_wikipedia(query):
-    load_docs(WikipediaLoader(query=query, load_max_docs=1).load())
-
-def load_urls(urls):
-    load_docs(AsyncHtmlLoader(urls).load())
-
-
-wiki_query = "Kerbel Space Program"
-print(f"Loading Wikipedia pages on: {wiki_query}")
-load_wikipedia(wiki_query)
-
-urls = ["https://wiki.kerbalspaceprogram.com/wiki/Kerbin", "https://wiki.kerbalspaceprogram.com/wiki/Eve"]
-print(f"Loading: {urls}")
-load_urls(urls)
 
 print("RAG database initialized with the following sources.")
 retriever = vectorstore.as_retriever()
 document_data_sources = set()
 for doc_metadata in retriever.vectorstore.get()['metadatas']:
-    document_data_sources.add(doc_metadata['source']) 
+    document_data_sources.add(doc_metadata['sourceURL']) 
 for doc in document_data_sources:
     print(f"  {doc}")
\ No newline at end of file
diff --git a/hw1/search.py b/hw1/search.py
new file mode 100644
index 0000000..2ef218f
--- /dev/null
+++ b/hw1/search.py
@@ -0,0 +1,43 @@
+from langchain_community.vectorstores import Chroma
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+import readline
+from langchain_community.vectorstores import Chroma
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from dotenv import load_dotenv
+
+"""
+A search utility for the loaded documents, for testing and debugging.
+Directly adapted from https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG, with small changes for env loading and OpenAI embeddings.
+"""
+
+
+load_dotenv()
+vectorstore = Chroma(
+    embedding_function=OpenAIEmbeddings(),
+    persist_directory="./rag_data/.chromadb"
+)
+
+def search_db(query):
+    docs = vectorstore.similarity_search(query)
+    print(f"Query database for: {query}")
+    if docs:
+        print(f"Closest document match in database: {docs[0].metadata['sourceURL']}")
+    else:
+        print("No matching documents")
+
+print("RAG database initialized.")
+retriever = vectorstore.as_retriever()
+document_data_sources = set()
+for doc_metadata in retriever.vectorstore.get()['metadatas']:
+    print(f"docm {doc_metadata}")
+    document_data_sources.add(doc_metadata['sourceURL']) 
+for doc in document_data_sources:
+    print(f"  {doc}")
+
+print("This program queries documents in the RAG database that are similar to whatever is entered.")
+while True:
+    line = input(">> ")
+    if line:
+        search_db(line)
+    else:
+        break