Overview

Mini RAG uses a clean, configuration-based API that groups related settings into logical units. This keeps options organized, makes them easier to maintain, and keeps your setup code readable.

Configuration Classes

Mini RAG provides four main configuration classes:
  • LLMConfig: Configure your language model settings
  • RetrievalConfig: Control retrieval behavior
  • RerankerConfig: Choose and configure reranking
  • ObservabilityConfig: Enable monitoring and tracing

LLMConfig

Configure your language model for answer generation.

Basic Configuration

from mini import AgenticRAG, LLMConfig

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    llm_config=LLMConfig(
        model="gpt-4o-mini"
    )
)

Complete Options

from mini import LLMConfig

llm_config = LLMConfig(
    model="gpt-4o-mini",              # Model name
    api_key=None,                      # API key (defaults to OPENAI_API_KEY env var)
    base_url=None,                     # Base URL (defaults to OPENAI_BASE_URL env var)
    temperature=0.7,                   # Response randomness (0.0-2.0)
    timeout=60.0,                      # Request timeout in seconds
    max_retries=3                      # Number of retry attempts
)

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    llm_config=llm_config
)

Parameter Reference

Parameter    Type           Default  Description
model        str            "gpt-4"  Model identifier
api_key      Optional[str]  None     API key (uses env var if None)
base_url     Optional[str]  None     Custom API endpoint
temperature  float          0.7      Sampling temperature
timeout      float          60.0     Request timeout in seconds
max_retries  int            3        Retry attempts

Common Configurations

Using OpenAI

llm_config = LLMConfig(
    model="gpt-4o-mini",
    temperature=0.7
)

Using Azure OpenAI

llm_config = LLMConfig(
    model="gpt-4",
    api_key="your-azure-key",
    base_url="https://your-resource.openai.azure.com/openai/deployments/your-deployment",
    temperature=0.7
)

Using Compatible API

# For OpenAI-compatible endpoints (e.g., llama.cpp, vLLM)
llm_config = LLMConfig(
    model="mistral-7b",
    api_key="not-needed",
    base_url="http://localhost:8080/v1",
    temperature=0.5
)

RetrievalConfig

Control how documents are retrieved and processed.

Basic Configuration

from mini import RetrievalConfig

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    retrieval_config=RetrievalConfig(
        top_k=10,
        rerank_top_k=3
    )
)

Complete Options

from mini import RetrievalConfig

retrieval_config = RetrievalConfig(
    top_k=10,                          # Number of chunks to retrieve initially
    rerank_top_k=3,                    # Number of chunks to keep after reranking
    use_query_rewriting=True,          # Enable query rewriting
    use_reranking=True,                # Enable reranking
    use_hybrid_search=False,           # Enable hybrid search (semantic + BM25)
    rrf_k=60                           # RRF constant for hybrid search
)

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    retrieval_config=retrieval_config
)

Parameter Reference

Parameter            Type  Default  Description
top_k                int   5        Initial retrieval count
rerank_top_k         int   3        Final chunk count after reranking
use_query_rewriting  bool  True     Generate query variations
use_reranking        bool  True     Rerank retrieved chunks
use_hybrid_search    bool  False    Combine semantic + keyword search
rrf_k                int   60       RRF fusion constant
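
The rrf_k constant controls Reciprocal Rank Fusion (RRF), the formula hybrid search uses to merge the semantic and BM25 rankings. As a rough sketch (the standard RRF formula, not Mini RAG's internal code), each ranking contributes 1 / (rrf_k + rank) to a chunk's fused score:

# Illustrative sketch of Reciprocal Rank Fusion, not Mini RAG internals
def rrf_merge(semantic_ranking, bm25_ranking, k=60):
    scores = {}
    for ranking in (semantic_ranking, bm25_ranking):
        for rank, chunk_id in enumerate(ranking, start=1):
            # Larger k flattens the gap between top and lower ranks
            scores[chunk_id] = scores.get(chunk_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

print(rrf_merge(["a", "b", "c"], ["b", "c", "a"]))  # ['b', 'a', 'c']

Chunks ranked well by both searches float to the top; a larger rrf_k makes the fusion more forgiving of disagreement between the two rankings.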

Tuning Guidelines

For Comprehensive Answers

# Retrieve more context, use more in final answer
RetrievalConfig(
    top_k=20,
    rerank_top_k=5,
    use_query_rewriting=True,
    use_reranking=True
)

For Fast, Focused Answers

# Retrieve fewer chunks for quicker responses
RetrievalConfig(
    top_k=5,
    rerank_top_k=2,
    use_query_rewriting=False,
    use_reranking=True
)

For Technical/Keyword Queries

# Use hybrid search for better keyword matching
RetrievalConfig(
    top_k=10,
    rerank_top_k=3,
    use_hybrid_search=True,
    use_query_rewriting=True
)

RerankerConfig

Choose and configure your reranking strategy.

Available Rerankers

Mini RAG supports four reranking strategies:
  1. LLM-based (default): Uses your LLM to score chunks
  2. Cohere: Uses Cohere’s specialized reranking API
  3. Sentence Transformer: Uses local cross-encoder models
  4. None: Disables reranking

LLM-based Reranking

from mini import RerankerConfig

# Default - uses your LLM
rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    reranker_config=RerankerConfig(
        type="llm"
    )
)

Cohere Reranking

import os
from mini import RerankerConfig

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    reranker_config=RerankerConfig(
        type="cohere",
        kwargs={
            "api_key": os.getenv("COHERE_API_KEY"),
            "model": "rerank-english-v3.0",  # or "rerank-multilingual-v3.0"
            "max_chunks_per_doc": None
        }
    )
)

Sentence Transformer Reranking

from mini import RerankerConfig

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    reranker_config=RerankerConfig(
        type="sentence-transformer",
        kwargs={
            "model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2",
            "device": "cuda"  # or "cpu"
        }
    )
)

Custom Reranker

from mini import RerankerConfig
from mini.reranker import CohereReranker

# Create custom reranker instance
custom_reranker = CohereReranker(
    api_key="your-key",
    model="rerank-multilingual-v3.0"
)

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    reranker_config=RerankerConfig(
        custom_reranker=custom_reranker
    )
)

Disable Reranking

from mini import RerankerConfig

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    reranker_config=RerankerConfig(
        type="none"
    )
)

Comparison

Reranker              Pros                   Cons                     Best For
LLM-based             Simple, no extra APIs  Uses LLM tokens          General use
Cohere                Highest quality        Requires API key, costs  Production quality
Sentence Transformer  Local, private, free   Requires GPU for speed   Privacy-sensitive
None                  Fastest                Lower quality            Speed-critical

ObservabilityConfig

Enable monitoring and tracing with Langfuse.

Basic Configuration

from mini import ObservabilityConfig

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    observability_config=ObservabilityConfig(
        enabled=True
    )
)

Complete Options

import os
from mini import ObservabilityConfig

observability_config = ObservabilityConfig(
    enabled=True,
    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
    host="https://cloud.langfuse.com"
)

rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    observability_config=observability_config
)

What Gets Tracked

When enabled, Mini RAG tracks:
  • 🔍 Query rewriting operations
  • 📚 Document retrieval metrics
  • 🎯 Reranking performance
  • 💬 LLM generation calls
  • 📄 Document indexing pipeline
  • ⏱️ Latency for each step
  • 🎭 Input/output data

Setup Langfuse

  1. Sign Up: Create a free account at Langfuse Cloud
  2. Get API Keys: Get your public and secret keys from project settings
  3. Set Environment Variables:

     LANGFUSE_PUBLIC_KEY=pk-lf-...
     LANGFUSE_SECRET_KEY=sk-lf-...
     LANGFUSE_HOST=https://cloud.langfuse.com

  4. Enable in Code:

     observability_config=ObservabilityConfig(enabled=True)

Embedding Configuration

Configure the embedding model separately if needed.

Basic Configuration

from mini import EmbeddingModel

embedding_model = EmbeddingModel()  # Uses defaults from env vars
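
If you prefer explicit settings, the same defaults can be spelled out by reading the environment variables yourself (assuming the OPENAI_API_KEY, OPENAI_BASE_URL, and EMBEDDING_MODEL variables listed under Environment Variables below are the ones the defaults use):

import os
from mini import EmbeddingModel

embedding_model = EmbeddingModel(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL"),
    model=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
)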

Complete Options

from mini import EmbeddingModel

embedding_model = EmbeddingModel(
    api_key="sk-...",                          # API key
    base_url="https://api.openai.com/v1",      # Custom endpoint
    model="text-embedding-3-small",            # Model name
    dimensions=None,                            # Custom dimensions (if supported)
    timeout=60.0,                               # Request timeout
    max_retries=3                               # Retry attempts
)

Provider Examples

OpenAI

embedding_model = EmbeddingModel(
    model="text-embedding-3-small",
    dimensions=1536
)

Azure OpenAI

embedding_model = EmbeddingModel(
    api_key="your-azure-key",
    base_url="https://your-resource.openai.azure.com/openai/deployments/your-deployment",
    model="text-embedding-ada-002"
)

Local Model

# Using llama.cpp or similar
embedding_model = EmbeddingModel(
    api_key="not-needed",
    base_url="http://localhost:8080/v1",
    model="text-embedding"
)

Vector Store Configuration

Configure Milvus vector storage.

Basic Configuration

import os
from mini import VectorStore

vector_store = VectorStore(
    uri=os.getenv("MILVUS_URI"),
    token=os.getenv("MILVUS_TOKEN"),
    collection_name="my_collection",
    dimension=1536
)

Complete Options

from mini import VectorStore

vector_store = VectorStore(
    uri="https://your-instance.com",           # Milvus URI
    token="your-token",                        # Authentication token
    collection_name="documents",               # Collection name
    dimension=1536,                            # Embedding dimension
    metric_type="IP",                          # Similarity metric
    index_type="IVF_FLAT",                    # Index type
    nlist=128                                  # Number of clusters
)

Parameter Reference

Parameter        Type  Default     Description
uri              str   -           Milvus server URI
token            str   -           Authentication token
collection_name  str   -           Collection identifier
dimension        int   -           Embedding vector dimension
metric_type      str   "IP"        Distance metric (IP, L2, COSINE)
index_type       str   "IVF_FLAT"  Index algorithm
nlist            int   128         Number of cluster units

Metric Types

  • IP (Inner Product): Fast; recommended for normalized vectors, where it is equivalent to cosine similarity (see the sketch below)
  • L2: Euclidean distance
  • COSINE: Direct cosine similarity
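
IP only behaves like cosine similarity when embeddings have unit length. If your embedding provider does not guarantee normalized outputs, you can normalize them yourself before indexing (an illustrative numpy sketch, independent of Mini RAG):

import numpy as np

# Scale each embedding to unit length so IP search behaves like cosine
def normalize(vectors: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / np.clip(norms, 1e-12, None)

embeddings = np.random.rand(4, 1536).astype(np.float32)
print(np.linalg.norm(normalize(embeddings), axis=1))  # all ~1.0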

Index Types

  • IVF_FLAT: Good balance of speed and accuracy
  • IVF_SQ8: Faster, uses less memory
  • HNSW: Highest accuracy, more memory
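
For example, a store tuned for accuracy might swap the index type (same VectorStore constructor as above; any HNSW-specific tuning parameters are omitted here):

vector_store = VectorStore(
    uri=os.getenv("MILVUS_URI"),
    token=os.getenv("MILVUS_TOKEN"),
    collection_name="documents",
    dimension=1536,
    metric_type="IP",
    index_type="HNSW"  # highest accuracy, more memory than IVF_FLAT
)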

Full Configuration Example

Putting it all together:

import os
from mini import (
    AgenticRAG,
    LLMConfig,
    RetrievalConfig,
    RerankerConfig,
    ObservabilityConfig,
    EmbeddingModel,
    VectorStore
)
from dotenv import load_dotenv

load_dotenv()

# Embedding model
embedding_model = EmbeddingModel(
    model="text-embedding-3-small",
    timeout=60.0
)

# Vector store
vector_store = VectorStore(
    uri=os.getenv("MILVUS_URI"),
    token=os.getenv("MILVUS_TOKEN"),
    collection_name="production_docs",
    dimension=1536,
    metric_type="IP",
    index_type="IVF_FLAT"
)

# RAG with all configs
rag = AgenticRAG(
    vector_store=vector_store,
    embedding_model=embedding_model,
    llm_config=LLMConfig(
        model="gpt-4o-mini",
        temperature=0.7,
        timeout=120.0,
        max_retries=3
    ),
    retrieval_config=RetrievalConfig(
        top_k=10,
        rerank_top_k=3,
        use_query_rewriting=True,
        use_reranking=True,
        use_hybrid_search=True
    ),
    reranker_config=RerankerConfig(
        type="cohere",
        kwargs={
            "api_key": os.getenv("COHERE_API_KEY"),
            "model": "rerank-english-v3.0"
        }
    ),
    observability_config=ObservabilityConfig(
        enabled=True,
        public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
        secret_key=os.getenv("LANGFUSE_SECRET_KEY")
    )
)

Environment Variables

Recommended .env file structure:

# OpenAI
OPENAI_API_KEY=sk-your-key
OPENAI_BASE_URL=https://api.openai.com/v1
EMBEDDING_MODEL=text-embedding-3-small

# Milvus
MILVUS_URI=https://your-instance.com
MILVUS_TOKEN=your-token

# Cohere (optional)
COHERE_API_KEY=your-cohere-key

# Langfuse (optional)
LANGFUSE_PUBLIC_KEY=pk-lf-...
LANGFUSE_SECRET_KEY=sk-lf-...
LANGFUSE_HOST=https://cloud.langfuse.com

Next Steps