DocumentLoader handles loading documents from various file formats using Microsoft’s MarkItDown library. It supports PDFs, Word documents, images with OCR, and more.
```python
from mini.loader import DocumentLoader

loader = DocumentLoader()

# Load a PDF
text = loader.load("document.pdf")
print(f"Loaded {len(text)} characters")

# Load a Word document
text = loader.load("document.docx")

# Load an image with OCR
text = loader.load("screenshot.png")
```
```python
from mini.loader import DocumentLoader

loader = DocumentLoader()

# Load all documents from a directory
texts = loader.load_documents_from_directory("./documents/")
print(f"Loaded {len(texts)} documents from directory")
```
```python
from mini.loader import DocumentLoader

loader = DocumentLoader()

try:
    text = loader.load("document.pdf")
except FileNotFoundError:
    print("Document not found")
except Exception as e:
    print(f"Error loading document: {e}")
```
```python
documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
loaded = []

for doc in documents:
    try:
        text = loader.load(doc)
        loaded.append(text)
    except Exception as e:
        print(f"Failed to load {doc}: {e}")
```
Large Files
For large files, consider chunking during or after loading:
```python
# Load large document
text = loader.load("large_document.pdf")

# Chunk immediately
chunks = chunker.chunk(text)

# Process in batches
batch_size = 100
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    # Process batch
```
Memory Management
Clear large text objects after processing:
```python
import gc

text = loader.load("huge_document.pdf")
chunks = chunker.chunk(text)

# Clear large text from memory
del text
gc.collect()
```