Add JPEG and PNG image support (#2)
## Summary

- Accept `.jpg`, `.jpeg`, and `.png` files in addition to `.pdf`
- Images are loaded directly via Pillow — no poppler required
- Unsupported extensions fail fast with a clear error message
- Output header uses "Image N" for images, "Page N" for PDFs
- `--dpi` and `--poppler` args apply to PDFs only (no behaviour change)

## Test plan

- [ ] Run on a JPEG scan and verify output is correct
- [ ] Run on a PNG and verify output is correct
- [ ] Run on a PDF and verify nothing regressed
- [ ] Pass an unsupported extension and verify the error message

Co-authored-by: Randa <obuvuyoviz26@gmail.com>
Reviewed-on: http://forgejo.localhost:3000/forgejo_admin/arabic-ocr/pulls/2
This commit is contained in:
parent
5f816bf3fa
commit
05fa727036
2 changed files with 32 additions and 17 deletions
16
README.md
16
README.md
|
|
@ -6,7 +6,7 @@ Smart Arabic OCR that auto-detects document type per page and applies tailored e
|
|||
|
||||
- Python 3.10+
|
||||
- [Ollama](https://ollama.com) running locally or on a reachable host
|
||||
- `poppler-utils` system package (for PDF rendering)
|
||||
- `poppler-utils` system package (for PDF rendering only)
|
||||
|
||||
## Setup
|
||||
|
||||
|
|
@ -72,22 +72,26 @@ Pass the model name to the script with `--model my-arabic-ocr`.
|
|||
```bash
|
||||
source arabic_ocr_env/bin/activate
|
||||
|
||||
# Auto-detect document type per page
|
||||
# PDF
|
||||
python arabic_ocr_smart.py document.pdf
|
||||
|
||||
# JPEG or PNG image
|
||||
python arabic_ocr_smart.py scan.jpg
|
||||
python arabic_ocr_smart.py photo.png
|
||||
|
||||
# Write output to a specific file
|
||||
python arabic_ocr_smart.py document.pdf output.txt
|
||||
|
||||
# Force a document type for all pages
|
||||
python arabic_ocr_smart.py scan.pdf --type handwritten  # or: certificate, id, table, form, mixed
|
||||
|
||||
# Use a different Ollama model
|
||||
python arabic_ocr_smart.py scan.pdf --model llava:13b
|
||||
|
||||
# Use a remote Ollama host
|
||||
python arabic_ocr_smart.py scan.pdf --host http://192.168.1.10:11434
|
||||
|
||||
# Windows: point at your poppler bin\ directory
|
||||
# PDF render resolution (PDF only, default: 300)
|
||||
python arabic_ocr_smart.py scan.pdf --dpi 150
|
||||
|
||||
# Windows: point at your poppler bin\ directory (PDF only)
|
||||
python arabic_ocr_smart.py scan.pdf --poppler "C:\poppler\bin"
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@ -7,10 +7,12 @@ handwritten text, certificates, IDs, tables, forms, and printed Arabic.
|
|||
|
||||
Usage:
|
||||
python arabic_ocr_smart.py document.pdf [output.txt]
|
||||
python arabic_ocr_smart.py scan.jpg [output.txt]
|
||||
python arabic_ocr_smart.py scan.png [output.txt]
|
||||
python arabic_ocr_smart.py scan.pdf --model qwen2.5vl:7b # default
|
||||
python arabic_ocr_smart.py scan.pdf --model llava:13b # any Ollama model
|
||||
python arabic_ocr_smart.py scan.pdf --host http://localhost:11434
|
||||
python arabic_ocr_smart.py scan.pdf --dpi 300
|
||||
python arabic_ocr_smart.py scan.pdf --dpi 300 # PDF only
|
||||
|
||||
Install:
|
||||
pip install pdf2image pillow
|
||||
|
|
@ -149,7 +151,7 @@ def main():
|
|||
parser = argparse.ArgumentParser(
|
||||
description="Arabic OCR with universal mixed-content prompting.",
|
||||
)
|
||||
parser.add_argument("input", help="Input PDF file")
|
||||
parser.add_argument("input", help="Input file: PDF, JPEG, or PNG")
|
||||
parser.add_argument("output", nargs="?", help="Output .txt file (optional)")
|
||||
parser.add_argument(
|
||||
"--host", default=DEFAULT_HOST,
|
||||
|
|
@ -177,26 +179,35 @@ def main():
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
pdf_path = Path(args.input)
|
||||
input_path = Path(args.input)
|
||||
output_path = Path(args.output) if args.output \
|
||||
else pdf_path.with_name(pdf_path.stem + "_ocr.txt")
|
||||
else input_path.with_name(input_path.stem + "_ocr.txt")
|
||||
|
||||
if not pdf_path.is_file():
|
||||
sys.exit(f"Error: file not found: {pdf_path}")
|
||||
if not input_path.is_file():
|
||||
sys.exit(f"Error: file not found: {input_path}")
|
||||
|
||||
print(f"\n[*] PDF : {pdf_path}")
|
||||
suffix = input_path.suffix.lower()
|
||||
if suffix not in {".pdf", ".jpg", ".jpeg", ".png"}:
|
||||
sys.exit(f"Error: unsupported file type '{suffix}'. Supported: .pdf, .jpg, .jpeg, .png")
|
||||
|
||||
print(f"\n[*] Input : {input_path}")
|
||||
print(f"[*] Model : {args.model}")
|
||||
print(f"[*] Ollama : {args.host}")
|
||||
print(f"[*] Output : {output_path}\n")
|
||||
|
||||
print("[*] Converting PDF to images...")
|
||||
pages = convert_from_path(str(pdf_path), dpi=args.dpi, poppler_path=args.poppler)
|
||||
print(f" {len(pages)} page(s) found.\n")
|
||||
if suffix == ".pdf":
|
||||
print("[*] Converting PDF to images...")
|
||||
pages = convert_from_path(str(input_path), dpi=args.dpi, poppler_path=args.poppler)
|
||||
print(f" {len(pages)} page(s) found.\n")
|
||||
else:
|
||||
pages = [Image.open(input_path).convert("RGB")]
|
||||
print(f"[*] Loaded image ({suffix})\n")
|
||||
|
||||
label = "Page" if suffix == ".pdf" else "Image"
|
||||
sections = []
|
||||
for i, page_img in enumerate(pages, start=1):
|
||||
text = ocr_page(args.host, args.model, page_img, i, args.timeout, args.ctx)
|
||||
header = f"{'='*60}\nPage {i}\n{'='*60}"
|
||||
header = f"{'='*60}\n{label} {i}\n{'='*60}"
|
||||
sections.append(f"{header}\n\n{text}")
|
||||
print()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue