initial commit
commit 93ee9c739c
5 changed files with 474 additions and 0 deletions
182 .gitignore vendored Normal file
@@ -0,0 +1,182 @@
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

### Python ###
# CUSTOM
*.sqlite3
*.bin
demo-rag-chroma/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

# End of https://www.toptal.com/developers/gitignore/api/python
34 Makefile Normal file
@@ -0,0 +1,34 @@
SHELL := /bin/bash

.PHONY: check fix clean run setup help
.DEFAULT_GOAL=help
VENV_DIR = .venv
PYTHON_VERSION = python3.11

check: # Ruff check
	@ruff check .
	@echo "✅ Check complete!"

fix: # Fix auto-fixable linting issues
	@ruff check app.py --fix

clean: # Clean temporary files
	@rm -rf __pycache__ .pytest_cache
	@find . -name '*.pyc' -exec rm -r {} +
	@find . -name '__pycache__' -exec rm -r {} +
	@rm -rf build dist
	@find . -name '*.egg-info' -type d -exec rm -r {} +

run: # Run the application
	@streamlit run app.py

setup: # Initial project setup
	@echo "Creating virtual env at: $(VENV_DIR)"
	@$(PYTHON_VERSION) -m venv $(VENV_DIR)
	@echo "Installing dependencies..."
	@source $(VENV_DIR)/bin/activate && pip install -r requirements/requirements-dev.txt && pip install -r requirements/requirements.txt
	@echo -e "\n✅ Done.\n🎉 Run the following commands to get started:\n\n ➡️ source $(VENV_DIR)/bin/activate\n ➡️ make run\n"

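# Self-documenting help: the egrep/awk pipeline below scans this Makefile for
# lines containing " # " comments and prints each target name next to its
# description (a common Makefile pattern, not project-specific logic).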
help: # Show this help
	@egrep -h '\s#\s' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?# "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
250 app.py Normal file
@@ -0,0 +1,250 @@
import os
import tempfile

import chromadb
import ollama
import streamlit as st
from chromadb.utils.embedding_functions.ollama_embedding_function import (
    OllamaEmbeddingFunction,
)
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder
from streamlit.runtime.uploaded_file_manager import UploadedFile

system_prompt = """
You are an AI assistant tasked with providing detailed answers based solely on the given context. Your goal is to analyze the information provided and formulate a comprehensive, well-structured response to the question.

The context will be passed as "Context:"
The user question will be passed as "Question:"

To answer the question:
1. Thoroughly analyze the context, identifying key information relevant to the question.
2. Organize your thoughts and plan your response to ensure a logical flow of information.
3. Formulate a detailed answer that directly addresses the question, using only the information provided in the context.
4. Ensure your answer is comprehensive, covering all relevant aspects found in the context.
5. If the context doesn't contain sufficient information to fully answer the question, state this clearly in your response.

Format your response as follows:
1. Use clear, concise language.
2. Organize your answer into paragraphs for readability.
3. Use bullet points or numbered lists where appropriate to break down complex information.
4. If relevant, include headings or subheadings to structure your response.
5. Ensure proper grammar, punctuation, and spelling throughout your answer.

Important: Base your entire response solely on the information provided in the context. Do not include any external knowledge or assumptions not present in the given text.
"""


def process_document(uploaded_file: UploadedFile) -> list[Document]:
    """Processes an uploaded PDF file by converting it to text chunks.

    Takes an uploaded PDF file, saves it temporarily, then loads and splits the
    content into text chunks using recursive character splitting.

    Args:
        uploaded_file: A Streamlit UploadedFile object containing the PDF file

    Returns:
        A list of Document objects containing the chunked text from the PDF

    Raises:
        IOError: If there are issues reading/writing the temporary file
    """
    # Store the uploaded file as a temp file on disk so PyMuPDF can open it by path
    temp_file = tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False)
    temp_file.write(uploaded_file.read())
    temp_file.close()  # Flush and close before the loader reads the file

    loader = PyMuPDFLoader(temp_file.name)
    docs = loader.load()
    os.unlink(temp_file.name)  # Delete temp file

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    )
    return text_splitter.split_documents(docs)
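
# Note on the splitter settings above: chunk_size=400 with chunk_overlap=100
# means consecutive chunks share roughly 100 characters, so a sentence cut at
# a chunk boundary still appears intact in at least one chunk; the separator
# list falls back from paragraph breaks to sentence punctuation to spaces
# before ever splitting mid-word.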


def get_vector_collection() -> chromadb.Collection:
    """Gets or creates a ChromaDB collection for vector storage.

    Creates an Ollama embedding function using the nomic-embed-text model and
    initializes a persistent ChromaDB client. Returns a collection that can be
    used to store and query document embeddings.

    Returns:
        chromadb.Collection: A ChromaDB collection configured with the Ollama
            embedding function and cosine similarity space.
    """
    ollama_ef = OllamaEmbeddingFunction(
        url="http://localhost:11434/api/embeddings",
        model_name="nomic-embed-text:latest",
    )

    chroma_client = chromadb.PersistentClient(path="./chromadb")
    return chroma_client.get_or_create_collection(
        name="rag_app",
        embedding_function=ollama_ef,
        metadata={"hnsw:space": "cosine"},
    )
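
# Note: metadata={"hnsw:space": "cosine"} switches the collection's HNSW index
# from its default L2 distance to cosine distance, which suits text embeddings;
# PersistentClient keeps the index on disk (here under ./chromadb) so uploads
# survive app restarts.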


def add_to_vector_collection(all_splits: list[Document], file_name: str):
    """Adds document splits to a vector collection for semantic search.

    Takes a list of document splits and adds them to a ChromaDB vector
    collection along with their metadata and unique IDs based on the filename.

    Args:
        all_splits: List of Document objects containing text chunks and metadata
        file_name: String identifier used to generate unique IDs for the chunks

    Returns:
        None. Displays a success message via Streamlit when complete.

    Raises:
        ChromaDBError: If there are issues upserting documents to the collection
    """
    collection = get_vector_collection()
    documents, metadatas, ids = [], [], []

    for idx, split in enumerate(all_splits):
        documents.append(split.page_content)
        metadatas.append(split.metadata)
        ids.append(f"{file_name}_{idx}")

    collection.upsert(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
    )
    st.success("Data added to the vector store!")
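
# Note: because chunk IDs are deterministic ("{file_name}_{idx}"), upsert makes
# re-processing the same file idempotent: existing entries are overwritten
# rather than duplicated (though stale trailing chunks can remain if a new
# version of the file splits into fewer chunks).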


def query_collection(prompt: str, n_results: int = 10):
    """Queries the vector collection with a given prompt to retrieve relevant documents.

    Args:
        prompt: The search query text to find relevant documents.
        n_results: Maximum number of results to return. Defaults to 10.

    Returns:
        dict: Query results containing documents, distances and metadata from the collection.

    Raises:
        ChromaDBError: If there are issues querying the collection.
    """
    collection = get_vector_collection()
    results = collection.query(query_texts=[prompt], n_results=n_results)
    return results
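
# Note: ChromaDB returns results keyed per query, e.g. results["documents"][0]
# is the list of chunk texts for this single query, ordered by ascending
# distance, with results["ids"][0] and results["metadatas"][0] aligned to it.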


def call_llm(context: str, prompt: str):
    """Calls the language model with context and prompt to generate a response.

    Uses Ollama to stream responses from a language model by providing context
    and a question prompt. The model uses a system prompt to format and ground
    its responses appropriately.

    Args:
        context: String containing the relevant context for answering the question
        prompt: String containing the user's question

    Yields:
        String chunks of the generated response as they become available from the model

    Raises:
        OllamaError: If there are issues communicating with the Ollama API
    """
    response = ollama.chat(
        model="granite3-dense:latest",
        stream=True,
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"Context: {context}, Question: {prompt}",
            },
        ],
    )
    for chunk in response:
        if chunk["done"] is False:
            yield chunk["message"]["content"]
        else:
            break
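
# Note: with stream=True, ollama.chat returns an iterator of partial responses;
# each carries the next slice of text in chunk["message"]["content"], and the
# final chunk sets "done" to True, which ends the generator above.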


def re_rank_cross_encoders(prompt: str, documents: list[str]) -> tuple[str, list[int]]:
    """Re-ranks documents using a cross-encoder model for more accurate relevance scoring.

    Uses the MS MARCO MiniLM cross-encoder model to re-rank the input documents
    based on their relevance to the query prompt. Returns the concatenated text
    of the top 3 most relevant documents along with their indices.

    Args:
        prompt: The user's question, used as the query for scoring.
        documents: List of document strings to be re-ranked.

    Returns:
        tuple: A tuple containing:
            - relevant_text (str): Concatenated text from the top 3 ranked documents
            - relevant_text_ids (list[int]): List of indices for the top ranked documents

    Raises:
        ValueError: If documents list is empty
        RuntimeError: If cross-encoder model fails to load or rank documents
    """
    relevant_text = ""
    relevant_text_ids = []

    encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    ranks = encoder_model.rank(prompt, documents, top_k=3)
    for rank in ranks:
        relevant_text += documents[rank["corpus_id"]]
        relevant_text_ids.append(rank["corpus_id"])

    return relevant_text, relevant_text_ids
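
# Note: the bi-encoder retrieval above (Ollama embeddings + HNSW) scores query
# and chunks independently, which is fast but approximate; the cross-encoder
# reads each (query, chunk) pair jointly, so it is slower but a sharper judge
# of relevance, which is why it re-ranks only the 10 retrieved candidates
# rather than the whole collection.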


if __name__ == "__main__":
    # set_page_config must be the first Streamlit call on the page
    st.set_page_config(page_title="RAG Question Answer")

    # Document Upload Area
    with st.sidebar:
        uploaded_file = st.file_uploader(
            "**📑 Upload PDF files for QnA**", type=["pdf"], accept_multiple_files=False
        )

        process = st.button(
            "⚡️ Process",
        )
        if uploaded_file and process:
            normalize_uploaded_file_name = uploaded_file.name.translate(
                str.maketrans({"-": "_", ".": "_", " ": "_"})
            )
            all_splits = process_document(uploaded_file)
            add_to_vector_collection(all_splits, normalize_uploaded_file_name)

    # Question and Answer Area
    st.header("🗣️ RAG Question Answer")
    prompt = st.text_area("**Ask a question related to your document:**")
    ask = st.button(
        "🔥 Ask",
    )

    if ask and prompt:
        results = query_collection(prompt)
        context = results.get("documents")[0]
        relevant_text, relevant_text_ids = re_rank_cross_encoders(prompt, context)
        response = call_llm(context=relevant_text, prompt=prompt)
        st.write_stream(response)

        with st.expander("See retrieved documents"):
            st.write(results)

        with st.expander("See most relevant document ids"):
            st.write(relevant_text_ids)
            st.write(relevant_text)
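
# To try the app locally (assumes an Ollama server on localhost:11434 with the
# models referenced above already pulled):
#   ollama pull nomic-embed-text
#   ollama pull granite3-dense
#   make setup && source .venv/bin/activate && make run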
1 requirements/requirements-dev.txt Normal file
@@ -0,0 +1 @@
ruff==0.7.4
7 requirements/requirements.txt Normal file
@@ -0,0 +1,7 @@
ollama==0.3.3 # Local inference
chromadb==0.5.20 # Vector Database
sentence-transformers==3.3.1 # CrossEncoder Re-ranking
streamlit==1.40.1 # Application UI
PyMuPDF==1.24.14 # PDF Document loader
langchain-community==0.3.7 # Utils for text splitting