Examples
PDF Knowledge Base
Build a searchable knowledge base from a folder of PDF documents.
PDFs are everywhere—contracts, reports, manuals, research papers. This example shows how to build a searchable knowledge base from a folder of PDFs using Unrag's LLM extraction.
What you'll build
A complete PDF ingestion pipeline that:
- Scans a directory for PDF files
- Extracts text from each PDF using Gemini
- Chunks and embeds the extracted text
- Provides a search interface
Prerequisites
- Unrag installed and configured
- A folder of PDFs to index
- API access to Gemini (via Vercel AI Gateway)
Project structure
my-pdf-knowledge-base/
├── documents/ # Your PDFs go here
│ ├── contract-2024.pdf
│ ├── user-manual.pdf
│ └── quarterly-report.pdf
├── scripts/
│ ├── ingest.ts # Ingest all PDFs
│ └── search.ts # Search interface
├── lib/
│ └── unrag/ # Generated Unrag files
├── unrag.config.ts
└── package.json

Configuration
Configure Unrag with PDF extraction enabled:
// unrag.config.ts
import { defineUnragConfig } from "./lib/unrag/core";
import { createDrizzleVectorStore } from "./lib/unrag/store/drizzle";
import { db } from "./lib/db";
export const unrag = defineUnragConfig({
defaults: {
chunking: {
chunkSize: 1000,
chunkOverlap: 100,
},
retrieval: {
topK: 8,
},
},
embedding: {
provider: "ai",
config: {
type: "text",
model: "openai/text-embedding-3-small",
},
},
engine: {
assetProcessing: {
onError: "skip", // Continue if a PDF fails
pdf: {
llmExtraction: {
enabled: true,
model: "google/gemini-2.0-flash",
timeoutMs: 90_000, // 90s for large PDFs
maxBytes: 20 * 1024 * 1024, // 20 MB limit
maxOutputChars: 300_000,
},
},
},
},
} as const);
export function createUnragEngine() {
const store = createDrizzleVectorStore(db);
return unrag.createEngine({ store });
}

Ingest script
// scripts/ingest.ts
import { createUnragEngine } from "../unrag.config";
import { readdir, readFile, stat } from "fs/promises";
import path from "path";
const PDF_DIR = path.join(process.cwd(), "documents");
async function getPdfFiles(dir: string): Promise<string[]> {
const entries = await readdir(dir, { withFileTypes: true });
const files: string[] = [];
for (const entry of entries) {
const fullPath = path.join(dir, entry.name);
if (entry.isDirectory()) {
files.push(...(await getPdfFiles(fullPath)));
} else if (entry.name.toLowerCase().endsWith(".pdf")) {
files.push(fullPath);
}
}
return files;
}
async function main() {
const engine = createUnragEngine();
const pdfFiles = await getPdfFiles(PDF_DIR);
console.log(`Found ${pdfFiles.length} PDFs to ingest\n`);
let successCount = 0;
let errorCount = 0;
for (const filePath of pdfFiles) {
const filename = path.basename(filePath);
const relativePath = path.relative(PDF_DIR, filePath);
const stats = await stat(filePath);
// Create a stable sourceId from the path
const sourceId = `pdf:${relativePath.replace(/\\/g, "/")}`;
console.log(`Processing: ${relativePath} (${formatBytes(stats.size)})`);
try {
const bytes = await readFile(filePath);
const result = await engine.ingest({
sourceId,
content: "", // No text content, just the PDF asset
metadata: {
filename,
path: relativePath,
size: stats.size,
ingestedAt: new Date().toISOString(),
},
assets: [
{
assetId: "main",
kind: "pdf",
data: {
kind: "bytes",
bytes: new Uint8Array(bytes),
mediaType: "application/pdf",
filename,
},
},
],
});
console.log(` ✓ Created ${result.chunkCount} chunks\n`);
successCount++;
} catch (error) {
console.error(` ✗ Error: ${error.message}\n`);
errorCount++;
}
}
console.log(`\nComplete: ${successCount} succeeded, ${errorCount} failed`);
}
function formatBytes(bytes: number): string {
if (bytes < 1024) return `${bytes} B`;
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}
main().catch(console.error);

Run with:
npx tsx scripts/ingest.ts

Search script
// scripts/search.ts
import { createUnragEngine } from "../unrag.config";
import readline from "readline";
async function main() {
const engine = createUnragEngine();
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
console.log("PDF Knowledge Base Search");
console.log("Type your query, or 'exit' to quit.\n");
const prompt = () => {
rl.question("Query: ", async (query) => {
if (query.toLowerCase() === "exit") {
rl.close();
return;
}
if (!query.trim()) {
prompt();
return;
}
try {
const result = await engine.retrieve({
query,
topK: 5,
});
console.log(`\nFound ${result.chunks.length} results:\n`);
for (let i = 0; i < result.chunks.length; i++) {
const chunk = result.chunks[i];
const score = result.scores?.[i]?.toFixed(3) || "N/A";
const file = chunk.metadata.filename || "Unknown";
console.log(`[${i + 1}] ${file} (score: ${score})`);
console.log(` ${chunk.content.slice(0, 200).replace(/\n/g, " ")}...`);
console.log();
}
} catch (error) {
console.error(`Error: ${error.message}`);
}
prompt();
});
};
prompt();
}
main().catch(console.error);

Run with:
npx tsx scripts/search.ts

Adding a web interface
Create an API endpoint for search:
// app/api/search/route.ts (Next.js)
import { createUnragEngine } from "@/unrag.config";
import { NextRequest, NextResponse } from "next/server";
export async function GET(req: NextRequest) {
const query = req.nextUrl.searchParams.get("q");
if (!query) {
return NextResponse.json({ error: "Missing query" }, { status: 400 });
}
const engine = createUnragEngine();
const result = await engine.retrieve({
query,
topK: 10,
});
return NextResponse.json({
query,
results: result.chunks.map((chunk, i) => ({
content: chunk.content,
score: result.scores?.[i],
file: chunk.metadata.filename,
path: chunk.metadata.path,
})),
});
}

Handling large PDFs
For very large PDFs, consider:
1. Increase timeouts
assetProcessing: {
pdf: {
llmExtraction: {
timeoutMs: 180_000, // 3 minutes
},
},
},

2. Process in batches
const BATCH_SIZE = 5;
for (let i = 0; i < pdfFiles.length; i += BATCH_SIZE) {
const batch = pdfFiles.slice(i, i + BATCH_SIZE);
await Promise.all(
batch.map((file) => ingestPdf(engine, file))
);
console.log(`Processed ${Math.min(i + BATCH_SIZE, pdfFiles.length)}/${pdfFiles.length}`);
}

3. Skip very large files
const MAX_SIZE = 25 * 1024 * 1024; // 25 MB
for (const filePath of pdfFiles) {
const stats = await stat(filePath);
if (stats.size > MAX_SIZE) {
console.log(`Skipping ${filePath} (too large)`);
continue;
}
// Process...
}

Incremental updates
Track which PDFs have been ingested to avoid reprocessing:
import { readFile, writeFile } from "fs/promises";
const MANIFEST_PATH = "./pdf-manifest.json";
type Manifest = Record<string, { mtime: number; chunks: number }>;
async function loadManifest(): Promise<Manifest> {
try {
const data = await readFile(MANIFEST_PATH, "utf-8");
return JSON.parse(data);
} catch {
return {};
}
}
async function saveManifest(manifest: Manifest) {
await writeFile(MANIFEST_PATH, JSON.stringify(manifest, null, 2));
}
async function main() {
const engine = createUnragEngine();
const manifest = await loadManifest();
const pdfFiles = await getPdfFiles(PDF_DIR);
for (const filePath of pdfFiles) {
const stats = await stat(filePath);
const existing = manifest[filePath];
// Skip if file hasn't changed
if (existing && existing.mtime === stats.mtimeMs) {
console.log(`Skipping ${filePath} (unchanged)`);
continue;
}
const result = await ingestPdf(engine, filePath);
manifest[filePath] = {
mtime: stats.mtimeMs,
chunks: result.chunkCount,
};
}
await saveManifest(manifest);
}

Custom extraction prompts
Tailor the extraction prompt for your document type:
// For legal contracts
assetProcessing: {
pdf: {
llmExtraction: {
enabled: true,
prompt: `
Extract all text from this legal document. Pay special attention to:
- Section and clause numbers
- Defined terms (preserve exact capitalization)
- Dates and monetary amounts
- Signature blocks
Preserve the document structure as markdown.
`.trim(),
},
},
},
// For technical manuals
assetProcessing: {
pdf: {
llmExtraction: {
enabled: true,
prompt: `
Extract all text from this technical document. Include:
- All headings and subheadings
- Numbered steps and procedures
- Table contents (format as markdown tables)
- Figure captions
Preserve technical terminology exactly.
`.trim(),
},
},
},