started work on the loader
This commit is contained in:
parent
de9838badc
commit
24c5e4401b
51
hw1/app.py
51
hw1/app.py
@ -0,0 +1,51 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import math
|
||||
import numpy
|
||||
from dotenv import load_dotenv
|
||||
from bs4 import BeautifulSoup
|
||||
from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
from langchain import hub
|
||||
from langchain.chains import LLMChain
|
||||
from langchain.memory import ConversationBufferMemory
|
||||
from langchain.prompts import (
|
||||
MessagesPlaceholder,
|
||||
HumanMessagePromptTemplate,
|
||||
ChatPromptTemplate,
|
||||
PromptTemplate,
|
||||
)
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_core.messages import HumanMessage, SystemMessage
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.runnables import RunnablePassthrough
|
||||
from langchain_google_genai import (
|
||||
GoogleGenerativeAI,
|
||||
GoogleGenerativeAIEmbeddings,
|
||||
ChatGoogleGenerativeAI,
|
||||
HarmCategory,
|
||||
HarmBlockThreshold,
|
||||
)
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader, RecursiveUrlLoader
|
||||
from langchain_community.document_transformers import BeautifulSoupTransformer
|
||||
from langchain_community.vectorstores import Chroma
|
||||
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain_openai import OpenAI
|
||||
|
||||
from langchain_core.messages import HumanMessage
|
||||
|
||||
# Smoke-test both the completion-style and chat-style OpenAI interfaces.
load_dotenv()  # pull OPENAI_API_KEY (and friends) from a local .env file

llm = OpenAI()
chat_model = ChatOpenAI(model="gpt-4")

text = "What is a good question to put here?"
messages = [HumanMessage(content=text)]

# BUG FIX: the original discarded both invoke() results, so the script ran
# the API calls but printed nothing; the trailing comments clearly expect
# the responses to be shown.
print(llm.invoke(text))
# >> Feetful of Fun

print(chat_model.invoke(messages))
# >> AIMessage(content="Socks O'Color")
|
69
hw1/loader.py
Normal file
69
hw1/loader.py
Normal file
@ -0,0 +1,69 @@
|
||||
from langchain_community.document_loaders import AsyncHtmlLoader, DirectoryLoader, TextLoader, PyPDFDirectoryLoader, Docx2txtLoader, UnstructuredMarkdownLoader, WikipediaLoader, ArxivLoader, CSVLoader
|
||||
from langchain_community.vectorstores import Chroma
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
||||
from langchain_community.document_loaders import WebBaseLoader
|
||||
import bs4
|
||||
|
||||
"""
|
||||
Loader attempting to load documents for the game Kerbal Space program two, both from wikipedia, as well as details from
|
||||
the games own fan-run wiki, using GPT4
|
||||
|
||||
Code adapted from
|
||||
1) https://github.com/langchain-ai/rag-from-scratch/blob/main/rag_from_scratch_1_to_4.ipynb
|
||||
2) https://codelabs.cs.pdx.edu/labs/G2.3_LangChainRAG
|
||||
"""
|
||||
|
||||
|
||||
# vectorstore = Chroma(
|
||||
# embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_query"),
|
||||
# persist_directory="./rag_data/.chromadb"
|
||||
# )
|
||||
|
||||
# Seed the vector store with one blog post, keeping only the article
# body/title/header nodes from the fetched HTML.
_strainer = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": _strainer},
)
docs = loader.load()

# Chunk the post into overlapping windows before embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI and build the shared Chroma store that the
# load_* helpers below append to.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
|
||||
|
||||
def load_docs(docs):
    """Split *docs* into coarse chunks and append them to the shared vectorstore."""
    # Note: uses a much coarser splitter (10000/10) than the module-level one.
    chunker = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
    vectorstore.add_documents(documents=chunker.split_documents(docs))
|
||||
|
||||
def load_wikipedia(query):
    """Fetch the top Wikipedia article matching *query* and index it."""
    wiki_docs = WikipediaLoader(query=query, load_max_docs=1).load()
    load_docs(wiki_docs)
|
||||
|
||||
def load_urls(urls):
    """Fetch each URL's HTML asynchronously and index the resulting pages."""
    pages = AsyncHtmlLoader(urls).load()
    load_docs(pages)
|
||||
|
||||
|
||||
wiki_query = "Kerbel Space Program"
|
||||
print(f"Loading Wikipedia pages on: {wiki_query}")
|
||||
load_wikipedia(wiki_query)
|
||||
|
||||
urls = ["https://wiki.kerbalspaceprogram.com/wiki/Kerbin", "https://wiki.kerbalspaceprogram.com/wiki/Eve"]
|
||||
print(f"Loading: {urls}")
|
||||
load_urls(urls)
|
||||
|
||||
print("RAG database initialized with the following sources.")
|
||||
retriever = vectorstore.as_retriever()
|
||||
document_data_sources = set()
|
||||
for doc_metadata in retriever.vectorstore.get()['metadatas']:
|
||||
document_data_sources.add(doc_metadata['source'])
|
||||
for doc in document_data_sources:
|
||||
print(f" {doc}")
|
Reference in New Issue
Block a user