AI Document Processing: OCR, Extraction, and Structured Data
Transform unstructured documents into structured, actionable data
AI Document Processing
The Document Processing Challenge
Organizations deal with millions of unstructured documents.
OCR with Tesseract + AI Enhancement
python
import pytesseract
from PIL import Image
import cv2


def extract_text_from_image(image_path: str) -> str:
    """Run Tesseract OCR on an image file and return the recognized text.

    The image is converted to grayscale and denoised before it is handed
    to Tesseract.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        ValueError: If OpenCV cannot load an image from ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising;
    # without this check the error surfaces later as an opaque cv2.error
    # inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    # Preprocess image for better OCR
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, h=10)

    # Tesseract OCR
    pil_img = Image.fromarray(denoised)
    return pytesseract.image_to_string(pil_img, config='--psm 6')
def enhance_with_ai(raw_text: str, document_type: str) -> dict:
    """Use AI to fix OCR errors and extract structure.

    Sends the raw OCR text to the ``gpt-4o`` chat model with JSON output
    requested and returns the decoded reply.

    Args:
        raw_text: OCR output to clean up.
        document_type: Kind of document; interpolated into the prompt.

    Returns:
        The model's reply decoded with ``json.loads``.

    Raises:
        ValueError: If the reply carries no text content.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Local import: no ``import json`` is visible alongside this snippet.
    import json

    # Same prompt text as before, assembled line by line so the source
    # indentation cannot leak into the string sent to the model.
    prompt = (
        f"Clean up this {document_type} OCR text and extract:\n"
        "- All field names and values\n"
        "- Fix obvious OCR errors\n"
        "Return as structured JSON.\n"
        f"Raw text: {raw_text}"
    )

    # NOTE(review): ``client`` is not defined in this snippet; presumably an
    # OpenAI client created elsewhere — confirm.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )

    content = response.choices[0].message.content
    # json.loads(None) would raise a bare TypeError; fail with a clear message.
    if content is None:
        raise ValueError("Model returned no content to parse")
    return json.loads(content)
Azure Document Intelligence (Form Recognizer)
python
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Client for the Azure Document Intelligence (Form Recognizer) service.
# NOTE(review): AZURE_ENDPOINT and AZURE_KEY are not defined in this snippet;
# they must be supplied by the surrounding configuration.
client = DocumentAnalysisClient(
    endpoint=AZURE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_KEY),
)

# Analyze the PDF with the prebuilt invoice model and wait for the result.
with open("invoice.pdf", "rb") as f:
    poller = client.begin_analyze_document("prebuilt-invoice", f)
    result = poller.result()

# The service may return no documents, and a recognized invoice may lack
# individual fields, so neither is indexed unconditionally.
if not result.documents:
    print("No invoice found in invoice.pdf")
else:
    invoice = result.documents[0]
    for label, field_name in (
        ("Invoice Number", "InvoiceId"),
        ("Total", "InvoiceTotal"),
    ):
        field = invoice.fields.get(field_name)
        value = field.value if field is not None else "N/A"
        print(f"{label}: {value}")
LLM-Based Extraction with Schema
python
from pydantic import BaseModel
from typing import Optional, List


class InvoiceLineItem(BaseModel):
    """A single line item of an invoice."""

    description: str
    quantity: float
    unit_price: float
    # NOTE(review): presumably quantity * unit_price; the model does not
    # enforce any relationship between these fields.
    total: float
class Invoice(BaseModel):
    """Schema describing the invoice data extracted from a document."""

    invoice_number: str
    # Stored as a plain string; this model applies no date parsing.
    date: str
    vendor_name: str
    total_amount: float
    line_items: List[InvoiceLineItem]
    # Optional field: None when no tax amount is supplied.
    tax_amount: Optional[float] = None
def extract_invoice(text: str) -> Invoice:
    """Extract structured invoice data from text using the ``gpt-4o`` model.

    The ``Invoice`` model is passed as the response format so the reply is
    parsed into an ``Invoice`` instance.

    Args:
        text: Invoice text (for example OCR output) to extract from.

    Returns:
        The ``Invoice`` parsed from the model's reply.

    Raises:
        ValueError: If the reply contains no parsed invoice, for example
            when the model refuses the request.
    """
    # NOTE(review): ``client`` is not defined in this snippet; presumably an
    # OpenAI client created elsewhere — confirm.
    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": f"Extract invoice data: {text}"},
        ],
        response_format=Invoice,
    )

    message = response.choices[0].message
    # ``parsed`` can be None; returning it would silently violate the
    # declared ``Invoice`` return type and fail later in the caller.
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Invoice extraction failed: {refusal or 'no parsed output'}"
        )
    return message.parsed
Pipeline for Scale
python
from celery import Celery

# Celery application that the document-processing task is registered on.
app = Celery('document_processor')
@app.task
def process_document(doc_path: str, doc_type: str):
    """Celery task: OCR a document, extract an invoice from it, and store it.

    Args:
        doc_path: Path of the document image passed to
            ``extract_text_from_image``.
        doc_type: Accepted but not used by this function body.

    Returns:
        A dict with ``"status"`` set to ``"success"`` and ``"data"`` holding
        the extracted invoice converted with ``.dict()``.
    """
    # OCR first, then hand the recognized text to the invoice extractor.
    invoice = extract_invoice(extract_text_from_image(doc_path))

    # Persist before building the task result.
    save_to_database(invoice)

    payload = invoice.dict()
    return {"status": "success", "data": payload}
Also available in 中文.