Add JPEG and PNG image support (#2)

## Summary

- Accept `.jpg`, `.jpeg`, and `.png` files in addition to `.pdf`
- Images are loaded directly via Pillow — no poppler required
- Unsupported extensions fail fast with a clear error message
- Output header uses "Image N" for images, "Page N" for PDFs
- `--dpi` and `--poppler` args apply to PDFs only (no behaviour change)

## Test plan

- [ ] Run on a JPEG scan and verify output is correct
- [ ] Run on a PNG and verify output is correct
- [ ] Run on a PDF and verify nothing regressed
- [ ] Pass an unsupported extension and verify the error message

Co-authored-by: Randa <obuvuyoviz26@gmail.com>
Reviewed-on: http://forgejo.localhost:3000/forgejo_admin/arabic-ocr/pulls/2
2026-06-26 22:22:49 +04:00 · 2026-06-26 22:22:49 +04:00 · 05fa727036
commit 05fa727036
parent 5f816bf3fa
2 changed files with 32 additions and 17 deletions
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ Smart Arabic OCR that auto-detects document type per page and applies tailored e

 - Python 3.10+
 - [Ollama](https://ollama.com) running locally or on a reachable host
- `poppler-utils` system package (for PDF rendering)
+- `poppler-utils` system package (for PDF rendering only)

 ## Setup

@@ -72,22 +72,26 @@ Pass the model name to the script with `--model my-arabic-ocr`.
 ```bash
 source arabic_ocr_env/bin/activate

-# Auto-detect document type per page
+# PDF
 python arabic_ocr_smart.py document.pdf

+# JPEG or PNG image
+python arabic_ocr_smart.py scan.jpg
+python arabic_ocr_smart.py photo.png
+
 # Write output to a specific file
 python arabic_ocr_smart.py document.pdf output.txt

-# Force a document type for all pages
-python arabic_ocr_smart.py scan.pdf --type handwritten|certificate|id|table|form|mixed
-
 # Use a different Ollama model
 python arabic_ocr_smart.py scan.pdf --model llava:13b

 # Use a remote Ollama host
 python arabic_ocr_smart.py scan.pdf --host http://192.168.1.10:11434

-# Windows: point at your poppler bin\ directory
+# PDF render resolution (PDF only, default: 300)
+python arabic_ocr_smart.py scan.pdf --dpi 150
+
+# Windows: point at your poppler bin\ directory (PDF only)
 python arabic_ocr_smart.py scan.pdf --poppler "C:\poppler\bin"
 ```

--- a/arabic_ocr_smart.py
+++ b/arabic_ocr_smart.py
@@ -7,10 +7,12 @@ handwritten text, certificates, IDs, tables, forms, and printed Arabic.

 Usage:
    python arabic_ocr_smart.py document.pdf [output.txt]
+    python arabic_ocr_smart.py scan.jpg [output.txt]
+    python arabic_ocr_smart.py scan.png [output.txt]
    python arabic_ocr_smart.py scan.pdf --model qwen2.5vl:7b    # default
    python arabic_ocr_smart.py scan.pdf --model llava:13b        # any Ollama model
    python arabic_ocr_smart.py scan.pdf --host http://localhost:11434
-    python arabic_ocr_smart.py scan.pdf --dpi 300
+    python arabic_ocr_smart.py scan.pdf --dpi 300                # PDF only

 Install:
    pip install pdf2image pillow
@@ -149,7 +151,7 @@ def main():
    parser = argparse.ArgumentParser(
        description="Arabic OCR with universal mixed-content prompting.",
    )
-    parser.add_argument("input",  help="Input PDF file")
+    parser.add_argument("input",  help="Input file: PDF, JPEG, or PNG")
    parser.add_argument("output", nargs="?", help="Output .txt file (optional)")
    parser.add_argument(
        "--host", default=DEFAULT_HOST,
@@ -177,26 +179,35 @@ def main():
    )
    args = parser.parse_args()

-    pdf_path    = Path(args.input)
+    input_path  = Path(args.input)
    output_path = Path(args.output) if args.output \
-                  else pdf_path.with_name(pdf_path.stem + "_ocr.txt")
+                  else input_path.with_name(input_path.stem + "_ocr.txt")

-    if not pdf_path.is_file():
-        sys.exit(f"Error: file not found: {pdf_path}")
+    if not input_path.is_file():
+        sys.exit(f"Error: file not found: {input_path}")

-    print(f"\n[*] PDF    : {pdf_path}")
+    suffix = input_path.suffix.lower()
+    if suffix not in {".pdf", ".jpg", ".jpeg", ".png"}:
+        sys.exit(f"Error: unsupported file type '{suffix}'. Supported: .pdf, .jpg, .jpeg, .png")
+
+    print(f"\n[*] Input  : {input_path}")
    print(f"[*] Model  : {args.model}")
    print(f"[*] Ollama : {args.host}")
    print(f"[*] Output : {output_path}\n")

-    print("[*] Converting PDF to images...")
-    pages = convert_from_path(str(pdf_path), dpi=args.dpi, poppler_path=args.poppler)
-    print(f"    {len(pages)} page(s) found.\n")
+    if suffix == ".pdf":
+        print("[*] Converting PDF to images...")
+        pages = convert_from_path(str(input_path), dpi=args.dpi, poppler_path=args.poppler)
+        print(f"    {len(pages)} page(s) found.\n")
+    else:
+        pages = [Image.open(input_path).convert("RGB")]
+        print(f"[*] Loaded image ({suffix})\n")

+    label  = "Page" if suffix == ".pdf" else "Image"
    sections = []
    for i, page_img in enumerate(pages, start=1):
        text = ocr_page(args.host, args.model, page_img, i, args.timeout, args.ctx)
-        header = f"{'='*60}\nPage {i}\n{'='*60}"
+        header = f"{'='*60}\n{label} {i}\n{'='*60}"
        sections.append(f"{header}\n\n{text}")
        print()