Initial commit: smart Arabic OCR script with document-aware prompting

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-26 18:31:44 +04:00 · 2026-06-26 18:31:44 +04:00 · 5aec8a5c6c
commit 5aec8a5c6c
4 changed files with 333 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+arabic_ocr_env/
+__pycache__/
+*.pyc
+*.pyo
+*_ocr.txt
--- a/CLAUDE.md
+++ b/CLAUDE.md
@ -0,0 +1,51 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Setup
+
+```bash
+# Create and activate the virtualenv
+python3 -m venv arabic_ocr_env
+source arabic_ocr_env/bin/activate
+
+# Install dependencies
+pip install -r requirements.txt
+# System dep required for pdf2image:
+sudo apt-get install -y poppler-utils
+```
+
+## Running
+
+```bash
+source arabic_ocr_env/bin/activate
+
+# Auto-detect document type per page
+python arabic_ocr_smart.py document.pdf [output.txt]
+
+# Force a specific document type
+python arabic_ocr_smart.py scan.pdf --type [handwritten|certificate|id|table|form|mixed]
+
+# Custom Ollama host (default: http://192.168.122.1:11434)
+python arabic_ocr_smart.py scan.pdf --host http://localhost:11434
+```
+
+## Architecture
+
+Single-file script (`arabic_ocr_smart.py`) with no tests or build system.
+
+**Pipeline**: PDF → PIL images (via `pdf2image`/poppler at 300 DPI) → base64 → Ollama `/api/chat` → structured text output.
+
+**Two-pass per page** (when no `--type` forced):
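+1. **Detection pass** — sends the `"detect"` prompt to the selected model (`qwen2.5vl:7b` unless `--model` is given) to classify the page into one of: `handwritten`, `certificate`, `id`, `table`, `form`, `mixed`. If the reply contains none of these labels, the page is treated as `mixed`.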
+2. **Extraction pass** — sends the type-specific prompt from the `PROMPTS` dict to extract structured text.
+
+**Key constants** (top of file):
+- `DEFAULT_HOST` — Ollama endpoint (VM bridged network by default)
+- `DEFAULT_MODEL` — `qwen2.5vl:7b` (override with `--model`)
+- `DPI` — default render resolution (300; override with `--dpi`)
+- `TIMEOUT` — 300 s per Ollama request
+
+**Prompts** are stored in the `PROMPTS` dict. Arabic prompts are used for Arabic-language outputs (handwritten, certificate, id, form, mixed); the table prompt is English to get Markdown output. To add a new document type, add a key to `PROMPTS`; it becomes available via `--type` automatically because `VALID_TYPES` is derived from the dict. For auto-detection to choose the new type, also add its label to the list inside the `"detect"` prompt.
+
+**Output format**: pages separated by `===` headers, each labeled with page number and detected type. Default output filename is `<input_stem>_ocr.txt` alongside the input PDF.
--- a/arabic_ocr_smart.py
+++ b/arabic_ocr_smart.py
@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+"""
+Smart Arabic OCR — document-aware prompting via qwen2.5vl:7b + Ollama
+=====================================================================
+Supports: handwritten text, certificates, IDs, tables, forms, mixed docs
+
+Usage:
+    python arabic_ocr_smart.py document.pdf [output.txt] [--host http://192.168.122.1:11434]
+    python arabic_ocr_smart.py scan.pdf --type table        # force a type
+    python arabic_ocr_smart.py scan.pdf --type certificate
+    python arabic_ocr_smart.py scan.pdf --type id
+    python arabic_ocr_smart.py scan.pdf --type handwritten
+    python arabic_ocr_smart.py scan.pdf --type form
+    python arabic_ocr_smart.py scan.pdf --type mixed
+    python arabic_ocr_smart.py scan.pdf --model qwen2.5vl:7b          # default
+    python arabic_ocr_smart.py scan.pdf --model llava:13b              # any Ollama model
+
+Install:
+    pip install pdf2image pillow
+    sudo apt-get install -y poppler-utils
+"""
+
+import sys
+import base64
+import json
+import io
+import argparse
+import urllib.request
+import urllib.error
+from pathlib import Path
+from pdf2image import convert_from_path
+from PIL import Image
+
+# ── Config ────────────────────────────────────────────────────────────────────
+
+# Base URL of the Ollama server, including scheme and port; "/api/chat" is
+# appended to it when a request is built. Overridable with --host.
+DEFAULT_HOST  = "http://192.168.122.1:11434"
+# Model name used when --model is not given.
+DEFAULT_MODEL = "qwen2.5vl:7b"
+# Default PDF render resolution; overridable with --dpi.
+DPI           = 300
+# Passed as the urlopen() timeout of every Ollama request. A page that is
+# auto-detected makes two requests (detection, then extraction).
+TIMEOUT       = 300   # seconds per request
+
+# ── Prompts per document type ─────────────────────────────────────────────────
+
+# Maps a document-type label to the instruction text sent to the model together
+# with the page image. The special "detect" key holds the classification prompt
+# used for the first pass; every other key is an extraction prompt and doubles
+# as an accepted --type value (see VALID_TYPES below).
+PROMPTS = {
+
+    # ── Auto-detect: first pass to classify the page
+    # The labels listed here must match the other keys of this dict, because
+    # detect_type() only accepts words found in VALID_TYPES.
+    "detect": (
+        "Look at this document image carefully. "
+        "Classify it as EXACTLY ONE of these types:\n"
+        "  - handwritten   (Arabic handwriting, notes, letters)\n"
+        "  - certificate   (official certificates, diplomas, awards)\n"
+        "  - id            (national ID, passport, driving licence, any ID card)\n"
+        "  - table         (data tables, spreadsheets, structured grids)\n"
+        "  - form          (forms with fields, checkboxes, blanks to fill)\n"
+        "  - mixed         (combination of the above, or printed Arabic text)\n\n"
+        "Reply with ONLY the single word label. No explanation."
+    ),
+
+    # ── Handwritten Arabic
+    # Gloss: transcribe all handwritten Arabic, keep line order (right to left)
+    # and diacritics, write [غير واضح] ("unclear") for illegible parts, and add
+    # no commentary.
+    "handwritten": (
+        "أنت نظام OCR متخصص في قراءة الخط العربي اليدوي. "
+        "اقرأ كل النص العربي المكتوب بخط اليد في هذه الصورة بدقة تامة. "
+        "حافظ على ترتيب الأسطر كما تظهر في الصورة من اليمين إلى اليسار. "
+        "أبقِ التشكيل (الحركات) إن وجد. "
+        "إذا كان بعض الكلام غير واضح، اكتب [غير واضح] بدلاً منه. "
+        "لا تضف أي تعليق أو وصف — فقط النص المستخرج."
+    ),
+
+    # ── Official certificates / diplomas
+    # Gloss: fill a fixed template (certificate type, issuing body, holder name,
+    # specialisation/subject, date, certificate number, grade, other important
+    # text); write 'غير متوفر' ("not available") for anything missing.
+    "certificate": (
+        "أنت نظام OCR متخصص في استخراج بيانات الشهادات الرسمية. "
+        "استخرج من هذه الشهادة المعلومات التالية بتنسيق منظم:\n\n"
+        "نوع الشهادة: ...\n"
+        "اسم الجهة المانحة: ...\n"
+        "اسم صاحب الشهادة: ...\n"
+        "التخصص أو المادة: ...\n"
+        "التاريخ: ...\n"
+        "رقم الشهادة (إن وجد): ...\n"
+        "التقدير أو الدرجة (إن وجد): ...\n"
+        "أي نص إضافي مهم: ...\n\n"
+        "إذا لم تجد معلومة ما، اكتب 'غير متوفر'. "
+        "لا تضف تعليقات خارج هذا التنسيق."
+    ),
+
+    # ── ID cards / passports / licences
+    # Gloss: fill a fixed template (document type, full name, document number,
+    # date of birth, nationality, sex, issue date, expiry date, address, other
+    # visible data), write 'غير متوفر' for missing items, and also read the
+    # machine-readable zone (MRZ) when present.
+    "id": (
+        "أنت نظام OCR متخصص في استخراج بيانات وثائق الهوية. "
+        "استخرج من هذه الوثيقة المعلومات التالية بتنسيق منظم:\n\n"
+        "نوع الوثيقة: ...\n"
+        "الاسم الكامل: ...\n"
+        "رقم الوثيقة: ...\n"
+        "تاريخ الميلاد: ...\n"
+        "الجنسية: ...\n"
+        "الجنس: ...\n"
+        "تاريخ الإصدار: ...\n"
+        "تاريخ الانتهاء: ...\n"
+        "العنوان (إن وجد): ...\n"
+        "أي بيانات أخرى مرئية: ...\n\n"
+        "إذا لم تجد معلومة ما، اكتب 'غير متوفر'. "
+        "اقرأ أيضاً المنطقة القابلة للقراءة الآلية (MRZ) في أسفل الوثيقة إن وجدت. "
+        "لا تضف تعليقات خارج هذا التنسيق."
+    ),
+
+    # ── Tables / grids
+    # Written in English (unlike the other extraction prompts) and asks for
+    # Markdown table output only.
+    "table": (
+        "You are an OCR system specializing in Arabic table extraction. "
+        "Extract the complete table from this image and reproduce it in "
+        "Markdown table format, preserving all rows and columns exactly. "
+        "Arabic text should remain in Arabic (right-to-left content). "
+        "If a cell is empty, leave it blank in the table. "
+        "If there are multiple tables, separate them with a blank line and "
+        "label them Table 1, Table 2, etc. "
+        "Output ONLY the Markdown table(s), no other commentary."
+    ),
+
+    # ── Forms with fields
+    # Gloss: list every field as "field name: entered value", write [فارغ]
+    # ("empty") for blank fields, mark checked boxes with ✓ and unchecked ones
+    # with ☐, and keep the form's field order.
+    "form": (
+        "أنت نظام OCR متخصص في استخراج بيانات النماذج والاستمارات. "
+        "استخرج جميع حقول هذا النموذج ومحتوياتها بالتنسيق التالي:\n\n"
+        "اسم الحقل: القيمة المدخلة\n\n"
+        "إذا كان الحقل فارغاً اكتب: اسم الحقل: [فارغ]\n"
+        "إذا كان هناك مربعات اختيار، حدد المُختارة بـ ✓ والفارغة بـ ☐\n"
+        "حافظ على الترتيب المنطقي للحقول كما تظهر في النموذج. "
+        "لا تضف أي تعليق خارج هذا التنسيق."
+    ),
+
+    # ── Mixed / general printed Arabic
+    # Also the fallback used by detect_type() when no label is recognised.
+    # Gloss: extract all Arabic text in natural reading order; tables as
+    # Markdown, headings on their own line wrapped in **, plain text line by
+    # line.
+    "mixed": (
+        "أنت نظام OCR متخصص في اللغة العربية. "
+        "هذه الصورة تحتوي على مزيج من أنواع المحتوى (نص مطبوع، جداول، صور، عناوين). "
+        "استخرج كل النص العربي والمعلومات المرئية بالترتيب الطبيعي للقراءة. "
+        "للجداول: أعد إنتاجها بتنسيق Markdown.\n"
+        "للعناوين: ضعها على سطر منفصل مع نجمتين ** حولها.\n"
+        "للنص العادي: اكتبه كما هو سطراً سطراً.\n"
+        "حافظ على اتجاه النص من اليمين إلى اليسار. "
+        "لا تضف تعليقات خارج المحتوى المستخرج."
+    ),
+}
+
+# Every PROMPTS key except the classifier prompt, in dict order. Used as the
+# accepted values of --type, in the --help epilog, and by detect_type() to
+# validate the model's answer.
+VALID_TYPES = [t for t in PROMPTS if t != "detect"]
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+def image_to_base64(pil_image: Image.Image) -> str:
+    """Serialize *pil_image* as PNG and return those bytes base64-encoded as text."""
+    with io.BytesIO() as png_buffer:
+        pil_image.save(png_buffer, format="PNG")
+        png_bytes = png_buffer.getvalue()
+    encoded = base64.b64encode(png_bytes)
+    return encoded.decode()
+
+
+def call_ollama(host: str, model: str, prompt: str, pil_image: Image.Image) -> str:
+    """
+    Send one prompt plus one page image to Ollama's /api/chat endpoint.
+
+    Args:
+        host: Base URL of the Ollama server, e.g. "http://localhost:11434".
+            A trailing slash is tolerated.
+        model: Ollama model name to query.
+        prompt: Instruction text sent as the single user message.
+        pil_image: Page image, attached to the message as base64-encoded PNG.
+
+    Returns:
+        The reply text (result["message"]["content"]) with surrounding
+        whitespace stripped.
+
+    Raises:
+        ConnectionError: If the server cannot be reached, or if it answers
+            with an HTTP error status. The message states which of the two
+            happened and, for HTTP errors, includes the response body.
+    """
+    payload = {
+        "model": model,
+        "messages": [
+            {
+                "role": "user",
+                "content": prompt,
+                "images": [image_to_base64(pil_image)],
+            }
+        ],
+        "stream": False,
+    }
+    data = json.dumps(payload).encode()
+    req  = urllib.request.Request(
+        # rstrip avoids a "//api/chat" path when --host ends with a slash.
+        f"{host.rstrip('/')}/api/chat",
+        data=data,
+        headers={"Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+            result = json.loads(resp.read())
+    except urllib.error.HTTPError as e:
+        # HTTPError subclasses URLError, so it must be caught first; otherwise
+        # a reachable server that rejects the request would be reported as
+        # unreachable. The response body is kept for the error message.
+        detail = e.read().decode("utf-8", errors="replace").strip()
+        raise ConnectionError(
+            f"Ollama at {host} rejected the request for model '{model}' "
+            f"(HTTP {e.code} {e.reason}).\n{detail}"
+        ) from e
+    except urllib.error.URLError as e:
+        raise ConnectionError(
+            f"Cannot reach Ollama at {host}. "
+            f"Is it running? (OLLAMA_HOST=0.0.0.0:11434 ollama serve)\n{e}"
+        ) from e
+    return result["message"]["content"].strip()
+
+
+def detect_type(host: str, model: str, pil_image: Image.Image) -> str:
+    """
+    Ask the model to classify the page and return one of VALID_TYPES.
+
+    The reply is lower-cased and split on whitespace; the first word that,
+    after stripping surrounding punctuation and Markdown markers, equals a
+    known type is returned. If no word matches, "mixed" is returned.
+
+    Raises:
+        ConnectionError: Propagated from call_ollama().
+    """
+    raw = call_ollama(host, model, PROMPTS["detect"], pil_image).lower()
+    for token in raw.split():
+        # Models often decorate the label ("**table**", "`id`", "(form)."),
+        # so strip emphasis markers and brackets as well as plain punctuation.
+        label = token.strip(".,;:!?'\"`*_#()[]{}<>-")
+        if label in VALID_TYPES:
+            return label
+    # Fallback when the reply contains no recognisable label
+    return "mixed"
+
+
+def ocr_page(
+    host: str,
+    model: str,
+    pil_image: Image.Image,
+    page_num: int,
+    forced_type: str | None,
+) -> tuple[str, str]:
+    """
+    Run OCR on a single page and report progress on stdout.
+
+    When forced_type is falsy, the page is classified first with
+    detect_type(); otherwise forced_type is used as-is. The prompt stored in
+    PROMPTS under the resulting type is then sent with the image.
+
+    Returns:
+        (doc_type, extracted_text)
+    """
+    if not forced_type:
+        print(f"  Page {page_num}: detecting type...", end=" ", flush=True)
+        doc_type = detect_type(host, model, pil_image)
+        print(f"→ {doc_type}", flush=True)
+    else:
+        doc_type = forced_type
+        print(f"  Page {page_num}: type forced → {doc_type}", flush=True)
+
+    print(f"  Page {page_num}: extracting text...", end=" ", flush=True)
+    extraction_prompt = PROMPTS[doc_type]
+    extracted = call_ollama(host, model, extraction_prompt, pil_image)
+    print("done.", flush=True)
+    return doc_type, extracted
+
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+
+def main():
+    """
+    Command-line entry point: convert a PDF to page images, OCR each page
+    through Ollama, and write the combined text to the output file.
+
+    Each page becomes a section headed by its page number and document type;
+    sections are joined with blank lines and written as UTF-8.
+
+    If a request to Ollama fails part-way through, the pages extracted so far
+    are still written to the output file and the process exits with an error
+    message instead of a traceback.
+    """
+    parser = argparse.ArgumentParser(
+        description="Smart Arabic OCR with document-aware prompting.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Document types: " + ", ".join(VALID_TYPES) + "\n"
+            "If --type is omitted, each page is auto-detected."
+        ),
+    )
+    parser.add_argument("input",  help="Input PDF file")
+    parser.add_argument("output", nargs="?", help="Output .txt file (optional)")
+    parser.add_argument(
+        "--type", choices=VALID_TYPES, default=None,
+        help="Force a document type for all pages (skips auto-detection)"
+    )
+    parser.add_argument(
+        "--host", default=DEFAULT_HOST,
+        help=f"Ollama host URL (default: {DEFAULT_HOST})"
+    )
+    parser.add_argument(
+        "--model", default=DEFAULT_MODEL,
+        help=f"Ollama model name (default: {DEFAULT_MODEL})"
+    )
+    parser.add_argument(
+        "--dpi", type=int, default=DPI,
+        help=f"PDF render resolution (default: {DPI})"
+    )
+    args = parser.parse_args()
+
+    # A zero or negative resolution cannot be rendered; reject it up front
+    # with a usage error rather than failing inside the PDF conversion.
+    if args.dpi <= 0:
+        parser.error("--dpi must be a positive integer")
+
+    pdf_path    = Path(args.input)
+    output_path = Path(args.output) if args.output \
+                  else pdf_path.with_name(pdf_path.stem + "_ocr.txt")
+
+    if not pdf_path.is_file():
+        sys.exit(f"Error: file not found: {pdf_path}")
+
+    print(f"\n[*] PDF     : {pdf_path}")
+    print(f"[*] Model   : {args.model}")
+    print(f"[*] Ollama  : {args.host}")
+    print(f"[*] Type    : {args.type or 'auto-detect per page'}")
+    print(f"[*] Output  : {output_path}\n")
+
+    print("[*] Converting PDF to images...")
+    pages = convert_from_path(str(pdf_path), dpi=args.dpi)
+    print(f"    {len(pages)} page(s) found.\n")
+
+    sections = []
+    failure = None
+    failed_page = None
+    for i, page_img in enumerate(pages, start=1):
+        try:
+            doc_type, text = ocr_page(args.host, args.model, page_img, i, args.type)
+        except ConnectionError as e:
+            # Stop at the first failed request but keep what was extracted.
+            failure, failed_page = e, i
+            print()  # finish the progress line left open by ocr_page()
+            break
+        header = f"{'='*60}\nPage {i}  |  Type: {doc_type}\n{'='*60}"
+        sections.append(f"{header}\n\n{text}")
+        print()
+
+    if failure is not None:
+        saved = ""
+        if sections:
+            # Only write when there is something to save, so that a failure on
+            # the first page does not replace an existing output file.
+            output_path.write_text("\n\n".join(sections), encoding="utf-8")
+            saved = (
+                f"\nPartial output ({len(sections)} of {len(pages)} page(s)) "
+                f"saved to: {output_path}"
+            )
+        sys.exit(f"Error on page {failed_page}: {failure}{saved}")
+
+    output_path.write_text("\n\n".join(sections), encoding="utf-8")
+    print(f"[✓] Done. Output saved to: {output_path}")
+
+if __name__ == "__main__":
+    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+pdf2image>=1.17.0
+Pillow>=10.0.0