Mirror of https://git.adityakumar.xyz/design-project.git/ (synced 2025-02-22 12:50:01 +00:00)

initial commit

commit 93ee9c739c: 5 changed files with 474 additions and 0 deletions
182  .gitignore  vendored  Normal file
@@ -0,0 +1,182 @@
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

### Python ###
# CUSTOM
*.sqlite3
*.bin
demo-rag-chroma/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

# End of https://www.toptal.com/developers/gitignore/api/python
34  Makefile  Normal file
@@ -0,0 +1,34 @@
SHELL := /bin/bash

.PHONY: check fix clean run setup help
.DEFAULT_GOAL = help
VENV_DIR = .venv
PYTHON_VERSION = python3.11

check: # Ruff check
	@ruff check .
	@echo "✅ Check complete!"

fix: # Fix auto-fixable linting issues
	@ruff check app.py --fix

clean: # Clean temporary files
	@rm -rf __pycache__ .pytest_cache
	@find . -name '*.pyc' -exec rm -r {} +
	@find . -name '__pycache__' -exec rm -r {} +
	@rm -rf build dist
	@find . -name '*.egg-info' -type d -exec rm -r {} +

run: # Run the application
	@streamlit run app.py

setup: # Initial project setup
	@echo "Creating virtual env at: $(VENV_DIR)"
	@$(PYTHON_VERSION) -m venv $(VENV_DIR)
	@echo "Installing dependencies..."
	@source $(VENV_DIR)/bin/activate && pip install -r requirements/requirements-dev.txt && pip install -r requirements/requirements.txt
	@echo -e "\n✅ Done.\n🎉 Run the following commands to get started:\n\n ➡️ source $(VENV_DIR)/bin/activate\n ➡️ make run\n"


help: # Show this help
	@egrep -h '\s#\s' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?# "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
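With these targets in place, first-time setup and launch reduce to the following session (abridged; assumes python3.11 is on PATH, per the PYTHON_VERSION variable above):

    $ make setup
    Creating virtual env at: .venv
    Installing dependencies...
    $ source .venv/bin/activate
    $ make run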
250  app.py  Normal file
@@ -0,0 +1,250 @@
import os
import tempfile

import chromadb
import ollama
import streamlit as st
from chromadb.utils.embedding_functions.ollama_embedding_function import (
    OllamaEmbeddingFunction,
)
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder
from streamlit.runtime.uploaded_file_manager import UploadedFile

system_prompt = """
You are an AI assistant tasked with providing detailed answers based solely on the given context. Your goal is to analyze the information provided and formulate a comprehensive, well-structured response to the question.

Context will be passed as "Context:"
The user question will be passed as "Question:"

To answer the question:
1. Thoroughly analyze the context, identifying key information relevant to the question.
2. Organize your thoughts and plan your response to ensure a logical flow of information.
3. Formulate a detailed answer that directly addresses the question, using only the information provided in the context.
4. Ensure your answer is comprehensive, covering all relevant aspects found in the context.
5. If the context doesn't contain sufficient information to fully answer the question, state this clearly in your response.

Format your response as follows:
1. Use clear, concise language.
2. Organize your answer into paragraphs for readability.
3. Use bullet points or numbered lists where appropriate to break down complex information.
4. If relevant, include any headings or subheadings to structure your response.
5. Ensure proper grammar, punctuation, and spelling throughout your answer.

Important: Base your entire response solely on the information provided in the context. Do not include any external knowledge or assumptions not present in the given text.
"""


def process_document(uploaded_file: UploadedFile) -> list[Document]:
    """Processes an uploaded PDF file by converting it to text chunks.

    Takes an uploaded PDF file, saves it temporarily, then loads and splits the
    content into text chunks using recursive character splitting.

    Args:
        uploaded_file: A Streamlit UploadedFile object containing the PDF file

    Returns:
        A list of Document objects containing the chunked text from the PDF

    Raises:
        IOError: If there are issues reading/writing the temporary file
    """
    # Store the uploaded file as a temp file; close it so the buffered bytes
    # are flushed to disk before the loader opens the file by name.
    temp_file = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False)
    temp_file.write(uploaded_file.read())
    temp_file.close()

    loader = PyMuPDFLoader(temp_file.name)
    docs = loader.load()
    os.unlink(temp_file.name)  # Delete temp file

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    )
    return text_splitter.split_documents(docs)
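For intuition about the chunking settings above, the same splitter can be exercised on plain text (a minimal sketch; "sample.txt" is a hypothetical stand-in for any local text file, and exact chunk boundaries depend on the installed langchain-text-splitters version):

    from langchain_text_splitters import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    )
    # Splits at the earliest separator that keeps each chunk under ~400
    # characters; consecutive chunks share up to 100 characters of overlap.
    chunks = splitter.split_text(open("sample.txt").read())  # hypothetical file
    print(len(chunks), chunks[0][:80])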
def get_vector_collection() -> chromadb.Collection:
    """Gets or creates a ChromaDB collection for vector storage.

    Creates an Ollama embedding function using the nomic-embed-text model and
    initializes a persistent ChromaDB client. Returns a collection that can be
    used to store and query document embeddings.

    Returns:
        chromadb.Collection: A ChromaDB collection configured with the Ollama
            embedding function and cosine similarity space.
    """
    ollama_ef = OllamaEmbeddingFunction(
        url="http://localhost:11434/api/embeddings",
        model_name="nomic-embed-text:latest",
    )

    chroma_client = chromadb.PersistentClient(path="./chromadb")
    return chroma_client.get_or_create_collection(
        name="rag_app",
        embedding_function=ollama_ef,
        metadata={"hnsw:space": "cosine"},
    )


def add_to_vector_collection(all_splits: list[Document], file_name: str):
    """Adds document splits to a vector collection for semantic search.

    Takes a list of document splits and adds them to a ChromaDB vector
    collection along with their metadata and unique IDs based on the filename.

    Args:
        all_splits: List of Document objects containing text chunks and metadata
        file_name: String identifier used to generate unique IDs for the chunks

    Returns:
        None. Displays a success message via Streamlit when complete.

    Raises:
        ChromaDBError: If there are issues upserting documents to the collection
    """
    collection = get_vector_collection()
    documents, metadatas, ids = [], [], []

    for idx, split in enumerate(all_splits):
        documents.append(split.page_content)
        metadatas.append(split.metadata)
        ids.append(f"{file_name}_{idx}")

    collection.upsert(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
    )
    st.success("Data added to the vector store!")


def query_collection(prompt: str, n_results: int = 10):
    """Queries the vector collection with a given prompt to retrieve relevant documents.

    Args:
        prompt: The search query text used to find relevant documents.
        n_results: Maximum number of results to return. Defaults to 10.

    Returns:
        dict: Query results containing documents, distances, and metadata from the collection.

    Raises:
        ChromaDBError: If there are issues querying the collection.
    """
    collection = get_vector_collection()
    results = collection.query(query_texts=[prompt], n_results=n_results)
    return results


def call_llm(context: str, prompt: str):
    """Calls the language model with context and prompt to generate a response.

    Uses Ollama to stream responses from a language model by providing context
    and a question prompt. The model uses a system prompt to format and ground
    its responses appropriately.

    Args:
        context: String containing the relevant context for answering the question
        prompt: String containing the user's question

    Yields:
        String chunks of the generated response as they become available from the model

    Raises:
        OllamaError: If there are issues communicating with the Ollama API
    """
    response = ollama.chat(
        model="granite3-dense:latest",
        stream=True,
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"Context: {context}, Question: {prompt}",
            },
        ],
    )
    for chunk in response:
        if chunk["done"] is False:
            yield chunk["message"]["content"]
        else:
            break


def re_rank_cross_encoders(prompt: str, documents: list[str]) -> tuple[str, list[int]]:
    """Re-ranks documents using a cross-encoder model for more accurate relevance scoring.

    Uses the MS MARCO MiniLM cross-encoder model to re-rank the input documents
    based on their relevance to the query prompt. Returns the concatenated text
    of the top 3 most relevant documents along with their indices.

    Args:
        prompt: The query text against which the documents are scored.
        documents: List of document strings to be re-ranked.

    Returns:
        tuple: A tuple containing:
            - relevant_text (str): Concatenated text from the top 3 ranked documents
            - relevant_text_ids (list[int]): List of indices for the top ranked documents

    Raises:
        ValueError: If documents list is empty
        RuntimeError: If cross-encoder model fails to load or rank documents
    """
    relevant_text = ""
    relevant_text_ids = []

    # The query is taken as an explicit parameter rather than a module-level
    # global, so the function also works outside the Streamlit script.
    encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    ranks = encoder_model.rank(prompt, documents, top_k=3)
    for rank in ranks:
        relevant_text += documents[rank["corpus_id"]]
        relevant_text_ids.append(rank["corpus_id"])

    return relevant_text, relevant_text_ids
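The loop above relies on the shape of CrossEncoder.rank() output, which looks roughly like this (a sketch; the query and scores are illustrative, not real output):

    from sentence_transformers import CrossEncoder

    encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    ranks = encoder.rank(
        "what is a vector database?",  # hypothetical query
        ["ChromaDB stores embeddings.", "Makefiles automate builds.", "HNSW is an index."],
        top_k=3,
    )
    # Each entry is a dict, ordered by descending relevance, e.g.:
    # [{"corpus_id": 0, "score": 7.2}, {"corpus_id": 2, "score": -1.3}, ...]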
if __name__ == "__main__":
    # Must be the first Streamlit call in the script
    st.set_page_config(page_title="RAG Question Answer")

    # Document Upload Area
    with st.sidebar:
        uploaded_file = st.file_uploader(
            "**📑 Upload PDF files for QnA**", type=["pdf"], accept_multiple_files=False
        )

        process = st.button(
            "⚡️ Process",
        )
        if uploaded_file and process:
            normalize_uploaded_file_name = uploaded_file.name.translate(
                str.maketrans({"-": "_", ".": "_", " ": "_"})
            )
            all_splits = process_document(uploaded_file)
            add_to_vector_collection(all_splits, normalize_uploaded_file_name)

    # Question and Answer Area
    st.header("🗣️ RAG Question Answer")
    prompt = st.text_area("**Ask a question related to your document:**")
    ask = st.button(
        "🔥 Ask",
    )

    if ask and prompt:
        results = query_collection(prompt)
        context = results.get("documents")[0]
        relevant_text, relevant_text_ids = re_rank_cross_encoders(prompt, context)
        response = call_llm(context=relevant_text, prompt=prompt)
        st.write_stream(response)

        with st.expander("See retrieved documents"):
            st.write(results)

        with st.expander("See most relevant document ids"):
            st.write(relevant_text_ids)
            st.write(relevant_text)
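End to end, the app wires these pieces together as upload, chunk, embed, retrieve, re-rank, generate. The same flow can be driven headlessly for testing (a sketch, assuming app.py is importable, Ollama is running locally, and a document has already been ingested; the question is hypothetical):

    from app import query_collection, re_rank_cross_encoders, call_llm

    question = "What are the payment terms?"
    results = query_collection(question)
    docs = results.get("documents")[0]
    relevant_text, ids = re_rank_cross_encoders(question, docs)
    print("".join(call_llm(context=relevant_text, prompt=question)))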
1  requirements/requirements-dev.txt  Normal file
@@ -0,0 +1 @@
ruff==0.7.4
7  requirements/requirements.txt  Normal file
@@ -0,0 +1,7 @@
ollama==0.3.3                 # Local inference
chromadb==0.5.20              # Vector Database
sentence-transformers==3.3.1  # CrossEncoder Re-ranking
streamlit==1.40.1             # Application UI
PyMuPDF==1.24.14              # PDF Document loader
langchain-community==0.3.7    # Utils for text splitting
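Note that the two Ollama models referenced in app.py are not Python dependencies; they are assumed to be pulled into the local Ollama instance beforehand:

    ollama pull nomic-embed-text:latest
    ollama pull granite3-dense:latest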
|