Initial commit: smart Arabic OCR script with document-aware prompting
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
5aec8a5c6c
4 changed files with 333 additions and 0 deletions
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
arabic_ocr_env/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*_ocr.txt
|
||||
51
CLAUDE.md
Normal file
51
CLAUDE.md
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
# Create and activate the virtualenv
|
||||
python3 -m venv arabic_ocr_env
|
||||
source arabic_ocr_env/bin/activate
|
||||
|
||||
# Install dependencies
|
||||
pip install -r requirements.txt
|
||||
# System dep required for pdf2image:
|
||||
sudo apt-get install -y poppler-utils
|
||||
```
|
||||
|
||||
## Running
|
||||
|
||||
```bash
|
||||
source arabic_ocr_env/bin/activate
|
||||
|
||||
# Auto-detect document type per page
|
||||
python arabic_ocr_smart.py document.pdf [output.txt]
|
||||
|
||||
# Force a specific document type
|
||||
python arabic_ocr_smart.py scan.pdf --type [handwritten|certificate|id|table|form|mixed]
|
||||
|
||||
# Custom Ollama host (default: http://192.168.122.1:11434)
|
||||
python arabic_ocr_smart.py scan.pdf --host http://localhost:11434
|
||||
```
|
||||
|
||||
## Architecture
|
||||
|
||||
Single-file script (`arabic_ocr_smart.py`) with no tests or build system.
|
||||
|
||||
**Pipeline**: PDF → PIL images (via `pdf2image`/poppler at 300 DPI) → base64 → Ollama `/api/chat` → structured text output.
|
||||
|
||||
**Two-pass per page** (when no `--type` forced):
|
||||
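1. **Detection pass** — sends the `"detect"` prompt to the selected model (`qwen2.5vl:7b` by default, overridable with `--model`) to classify the page into one of: `handwritten`, `certificate`, `id`, `table`, `form`, `mixed`. If the reply contains none of these labels, the page is treated as `mixed`.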
|
||||
2. **Extraction pass** — sends the type-specific prompt from the `PROMPTS` dict to extract structured text.
|
||||
|
||||
**Key constants** (top of file):
|
||||
- `DEFAULT_HOST` — Ollama endpoint (VM bridged network by default)
|
||||
- `DEFAULT_MODEL` — `qwen2.5vl:7b` (override per run with `--model`)
|
||||
- `DPI` — render resolution (300)
|
||||
- `TIMEOUT` — 300 s per Ollama request
|
||||
|
||||
**Prompts** are stored in the `PROMPTS` dict. Arabic prompts are used for Arabic-language outputs (handwritten, certificate, id, form, mixed); the table prompt is English to get Markdown output. Adding a new document type means adding a key to `PROMPTS`, and it is then automatically available via `--type`. For auto-detection to choose the new type, also add it to the list of labels in the `"detect"` prompt.
|
||||
|
||||
**Output format**: pages separated by `===` headers, each labeled with page number and detected type. Default output filename is `<input_stem>_ocr.txt` alongside the input PDF.
|
||||
275
arabic_ocr_smart.py
Normal file
275
arabic_ocr_smart.py
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smart Arabic OCR — document-aware prompting via qwen2.5vl:7b + Ollama
|
||||
=====================================================================
|
||||
Supports: handwritten text, certificates, IDs, tables, forms, mixed docs
|
||||
|
||||
Usage:
|
||||
python arabic_ocr_smart.py document.pdf [output.txt] [--host http://192.168.122.1:11434]
|
||||
python arabic_ocr_smart.py scan.pdf --type table # force a type
|
||||
python arabic_ocr_smart.py scan.pdf --type certificate
|
||||
python arabic_ocr_smart.py scan.pdf --type id
|
||||
python arabic_ocr_smart.py scan.pdf --type handwritten
|
||||
python arabic_ocr_smart.py scan.pdf --type form
|
||||
python arabic_ocr_smart.py scan.pdf --type mixed
|
||||
python arabic_ocr_smart.py scan.pdf --model qwen2.5vl:7b # default
|
||||
python arabic_ocr_smart.py scan.pdf --model llava:13b # any Ollama model
|
||||
|
||||
Install:
|
||||
pip install pdf2image pillow
|
||||
sudo apt-get install -y poppler-utils
|
||||
"""
|
||||
|
||||
import sys
|
||||
import base64
|
||||
import json
|
||||
import io
|
||||
import argparse
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from pathlib import Path
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Base URL of the Ollama server; used as the default for --host.
DEFAULT_HOST = "http://192.168.122.1:11434"
|
||||
# Ollama model name; used as the default for --model.
DEFAULT_MODEL = "qwen2.5vl:7b"
|
||||
# PDF render resolution; used as the default for --dpi.
DPI = 300
|
||||
# Passed as the urlopen timeout, so it applies to each Ollama request;
# auto-detect mode issues two requests per page (classify, then extract).
TIMEOUT = 300 # seconds per Ollama request
|
||||
|
||||
# ── Prompts per document type ─────────────────────────────────────────────────
|
||||
|
||||
PROMPTS = {
|
||||
|
||||
# ── Auto-detect: first pass to classify the page
|
||||
"detect": (
|
||||
"Look at this document image carefully. "
|
||||
"Classify it as EXACTLY ONE of these types:\n"
|
||||
" - handwritten (Arabic handwriting, notes, letters)\n"
|
||||
" - certificate (official certificates, diplomas, awards)\n"
|
||||
" - id (national ID, passport, driving licence, any ID card)\n"
|
||||
" - table (data tables, spreadsheets, structured grids)\n"
|
||||
" - form (forms with fields, checkboxes, blanks to fill)\n"
|
||||
" - mixed (combination of the above, or printed Arabic text)\n\n"
|
||||
"Reply with ONLY the single word label. No explanation."
|
||||
),
|
||||
|
||||
# ── Handwritten Arabic
|
||||
"handwritten": (
|
||||
"أنت نظام OCR متخصص في قراءة الخط العربي اليدوي. "
|
||||
"اقرأ كل النص العربي المكتوب بخط اليد في هذه الصورة بدقة تامة. "
|
||||
"حافظ على ترتيب الأسطر كما تظهر في الصورة من اليمين إلى اليسار. "
|
||||
"أبقِ التشكيل (الحركات) إن وجد. "
|
||||
"إذا كان بعض الكلام غير واضح، اكتب [غير واضح] بدلاً منه. "
|
||||
"لا تضف أي تعليق أو وصف — فقط النص المستخرج."
|
||||
),
|
||||
|
||||
# ── Official certificates / diplomas
|
||||
"certificate": (
|
||||
"أنت نظام OCR متخصص في استخراج بيانات الشهادات الرسمية. "
|
||||
"استخرج من هذه الشهادة المعلومات التالية بتنسيق منظم:\n\n"
|
||||
"نوع الشهادة: ...\n"
|
||||
"اسم الجهة المانحة: ...\n"
|
||||
"اسم صاحب الشهادة: ...\n"
|
||||
"التخصص أو المادة: ...\n"
|
||||
"التاريخ: ...\n"
|
||||
"رقم الشهادة (إن وجد): ...\n"
|
||||
"التقدير أو الدرجة (إن وجد): ...\n"
|
||||
"أي نص إضافي مهم: ...\n\n"
|
||||
"إذا لم تجد معلومة ما، اكتب 'غير متوفر'. "
|
||||
"لا تضف تعليقات خارج هذا التنسيق."
|
||||
),
|
||||
|
||||
# ── ID cards / passports / licences
|
||||
"id": (
|
||||
"أنت نظام OCR متخصص في استخراج بيانات وثائق الهوية. "
|
||||
"استخرج من هذه الوثيقة المعلومات التالية بتنسيق منظم:\n\n"
|
||||
"نوع الوثيقة: ...\n"
|
||||
"الاسم الكامل: ...\n"
|
||||
"رقم الوثيقة: ...\n"
|
||||
"تاريخ الميلاد: ...\n"
|
||||
"الجنسية: ...\n"
|
||||
"الجنس: ...\n"
|
||||
"تاريخ الإصدار: ...\n"
|
||||
"تاريخ الانتهاء: ...\n"
|
||||
"العنوان (إن وجد): ...\n"
|
||||
"أي بيانات أخرى مرئية: ...\n\n"
|
||||
"إذا لم تجد معلومة ما، اكتب 'غير متوفر'. "
|
||||
"اقرأ أيضاً المنطقة القابلة للقراءة الآلية (MRZ) في أسفل الوثيقة إن وجدت. "
|
||||
"لا تضف تعليقات خارج هذا التنسيق."
|
||||
),
|
||||
|
||||
# ── Tables / grids
|
||||
"table": (
|
||||
"You are an OCR system specializing in Arabic table extraction. "
|
||||
"Extract the complete table from this image and reproduce it in "
|
||||
"Markdown table format, preserving all rows and columns exactly. "
|
||||
"Arabic text should remain in Arabic (right-to-left content). "
|
||||
"If a cell is empty, leave it blank in the table. "
|
||||
"If there are multiple tables, separate them with a blank line and "
|
||||
"label them Table 1, Table 2, etc. "
|
||||
"Output ONLY the Markdown table(s), no other commentary."
|
||||
),
|
||||
|
||||
# ── Forms with fields
|
||||
"form": (
|
||||
"أنت نظام OCR متخصص في استخراج بيانات النماذج والاستمارات. "
|
||||
"استخرج جميع حقول هذا النموذج ومحتوياتها بالتنسيق التالي:\n\n"
|
||||
"اسم الحقل: القيمة المدخلة\n\n"
|
||||
"إذا كان الحقل فارغاً اكتب: اسم الحقل: [فارغ]\n"
|
||||
"إذا كان هناك مربعات اختيار، حدد المُختارة بـ ✓ والفارغة بـ ☐\n"
|
||||
"حافظ على الترتيب المنطقي للحقول كما تظهر في النموذج. "
|
||||
"لا تضف أي تعليق خارج هذا التنسيق."
|
||||
),
|
||||
|
||||
# ── Mixed / general printed Arabic
|
||||
"mixed": (
|
||||
"أنت نظام OCR متخصص في اللغة العربية. "
|
||||
"هذه الصورة تحتوي على مزيج من أنواع المحتوى (نص مطبوع، جداول، صور، عناوين). "
|
||||
"استخرج كل النص العربي والمعلومات المرئية بالترتيب الطبيعي للقراءة. "
|
||||
"للجداول: أعد إنتاجها بتنسيق Markdown.\n"
|
||||
"للعناوين: ضعها على سطر منفصل مع نجمتين ** حولها.\n"
|
||||
"للنص العادي: اكتبه كما هو سطراً سطراً.\n"
|
||||
"حافظ على اتجاه النص من اليمين إلى اليسار. "
|
||||
"لا تضف تعليقات خارج المحتوى المستخرج."
|
||||
),
|
||||
}
|
||||
|
||||
# Document types selectable via --type: every PROMPTS key except "detect",
# which is the classification prompt rather than an extraction prompt.
VALID_TYPES = [t for t in PROMPTS if t != "detect"]
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def image_to_base64(pil_image: Image.Image) -> str:
    """Serialize a PIL image as PNG and return the bytes as base64 text.

    Args:
        pil_image: Image to encode.

    Returns:
        The base64 encoding of the PNG data, decoded to ``str``.
    """
    with io.BytesIO() as png_buffer:
        pil_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode()
|
||||
|
||||
|
||||
def call_ollama(host: str, model: str, prompt: str, pil_image: Image.Image) -> str:
    """Send one prompt plus one page image to Ollama's ``/api/chat`` endpoint.

    Args:
        host: Base URL of the Ollama server, e.g. ``http://localhost:11434``.
            A trailing slash is tolerated.
        model: Name of the Ollama model to query.
        prompt: Instruction text sent as the single user message.
        pil_image: Page image; attached to the message as base64 PNG.

    Returns:
        The content of the reply message with surrounding whitespace removed.

    Raises:
        ConnectionError: If the server cannot be reached, or if it was reached
            but answered with an HTTP error status (the message states which).
    """
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt,
                "images": [image_to_base64(pil_image)],
            }
        ],
        "stream": False,
    }
    req = urllib.request.Request(
        # Strip a trailing slash so "http://host:11434/" does not become
        # "http://host:11434//api/chat".
        f"{host.rstrip('/')}/api/chat",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
            result = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        # Here the server WAS reached but rejected the request (for example an
        # unknown model name), so "is it running?" would be misleading.
        detail = e.read().decode("utf-8", errors="replace").strip()
        raise ConnectionError(
            f"Ollama at {host} returned HTTP {e.code} ({e.reason}) "
            f"for model '{model}'.\n{detail}"
        ) from e
    except urllib.error.URLError as e:
        raise ConnectionError(
            f"Cannot reach Ollama at {host}. "
            f"Is it running? (OLLAMA_HOST=0.0.0.0:11434 ollama serve)\n{e}"
        ) from e
    return result["message"]["content"].strip()
|
||||
|
||||
|
||||
def detect_type(host: str, model: str, pil_image: Image.Image) -> str:
    """Ask the model to classify the page and return one of VALID_TYPES.

    The reply is lower-cased and split on every character that is not a
    lowercase ASCII letter, digit or underscore, so labels wrapped in
    Markdown or punctuation (``**table**``, `` `id` ``, ``(form)``,
    ``type:table``) are still recognised.

    Args:
        host: Base URL of the Ollama server.
        model: Name of the Ollama model to query.
        pil_image: Page image to classify.

    Returns:
        The first token of the reply that is a known document type, or
        ``"mixed"`` when the reply contains none.
    """
    import re  # local import keeps this function self-contained

    raw = call_ollama(host, model, PROMPTS["detect"], pil_image).lower()
    for token in re.split(r"[^a-z0-9_]+", raw):
        # Underscores are kept inside tokens (so a key like "tax_form" can
        # match) but trimmed at the ends, where they are Markdown emphasis.
        label = token.strip("_")
        if label in VALID_TYPES:
            return label
    # Fallback when no known label appears in the reply
    return "mixed"
|
||||
|
||||
|
||||
def ocr_page(
    host: str,
    model: str,
    pil_image: Image.Image,
    page_num: int,
    forced_type: str | None,
) -> tuple[str, str]:
    """Classify (unless a type is forced) and extract the text of one page.

    Progress is printed to stdout as each step starts and finishes.

    Args:
        host: Base URL of the Ollama server.
        model: Name of the Ollama model to query.
        pil_image: Rendered page image.
        page_num: Page number shown in the progress messages.
        forced_type: Document type to use; when falsy, the type is detected
            with ``detect_type`` first.

    Returns:
        A ``(doc_type, extracted_text)`` tuple.
    """
    if not forced_type:
        print(f" Page {page_num}: detecting type...", end=" ", flush=True)
        kind = detect_type(host, model, pil_image)
        print(f"→ {kind}", flush=True)
    else:
        kind = forced_type
        print(f" Page {page_num}: type forced → {kind}", flush=True)

    print(f" Page {page_num}: extracting text...", end=" ", flush=True)
    extracted = call_ollama(host, model, PROMPTS[kind], pil_image)
    print("done.", flush=True)
    return kind, extracted
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: OCR every page of a PDF through Ollama.

    Parses the CLI arguments, renders the PDF to page images with
    ``convert_from_path``, runs ``ocr_page`` on each page and writes the
    labelled per-page results to the output text file (default:
    ``<input_stem>_ocr.txt`` next to the input PDF).

    If an Ollama request fails part-way through the document, the pages
    extracted so far are still written before the script exits with an error
    message, so completed work is not thrown away.
    """
    parser = argparse.ArgumentParser(
        description="Smart Arabic OCR with document-aware prompting.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Document types: " + ", ".join(VALID_TYPES) + "\n"
            "If --type is omitted, each page is auto-detected."
        ),
    )
    parser.add_argument("input", help="Input PDF file")
    parser.add_argument("output", nargs="?", help="Output .txt file (optional)")
    parser.add_argument(
        "--type", choices=VALID_TYPES, default=None,
        help="Force a document type for all pages (skips auto-detection)"
    )
    parser.add_argument(
        "--host", default=DEFAULT_HOST,
        help=f"Ollama host URL (default: {DEFAULT_HOST})"
    )
    parser.add_argument(
        "--model", default=DEFAULT_MODEL,
        help=f"Ollama model name (default: {DEFAULT_MODEL})"
    )
    parser.add_argument(
        "--dpi", type=int, default=DPI,
        help=f"PDF render resolution (default: {DPI})"
    )
    args = parser.parse_args()

    pdf_path = Path(args.input)
    output_path = Path(args.output) if args.output \
        else pdf_path.with_name(pdf_path.stem + "_ocr.txt")

    if not pdf_path.is_file():
        sys.exit(f"Error: file not found: {pdf_path}")

    print(f"\n[*] PDF : {pdf_path}")
    print(f"[*] Model : {args.model}")
    print(f"[*] Ollama : {args.host}")
    print(f"[*] Type : {args.type or 'auto-detect per page'}")
    print(f"[*] Output : {output_path}\n")

    print("[*] Converting PDF to images...")
    pages = convert_from_path(str(pdf_path), dpi=args.dpi)
    print(f" {len(pages)} page(s) found.\n")

    sections = []
    failure = None
    for i, page_img in enumerate(pages, start=1):
        try:
            doc_type, text = ocr_page(args.host, args.model, page_img, i, args.type)
        except (ConnectionError, TimeoutError) as exc:
            # Stop at the first failed request but keep what was already
            # extracted; each page can take minutes of model time.
            print("failed.", flush=True)
            failure = f"Error: OCR stopped at page {i} of {len(pages)}: {exc}"
            break
        header = f"{'='*60}\nPage {i} | Type: {doc_type}\n{'='*60}"
        sections.append(f"{header}\n\n{text}")
        print()

    # On failure with nothing extracted, leave any existing output file alone.
    if sections or failure is None:
        output_path.write_text("\n\n".join(sections), encoding="utf-8")

    if failure is not None:
        if sections:
            print(f"[!] Partial output ({len(sections)} page(s)) saved to: {output_path}")
        sys.exit(failure)

    print(f"[✓] Done. Output saved to: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
pdf2image>=1.17.0
|
||||
Pillow>=10.0.0
|
||||
Loading…
Add table
Reference in a new issue