Skip to main content

Installation

pip install parsefy fastapi python-multipart uvicorn
  • parsefy: Parsefy SDK for document extraction
  • fastapi: Modern, high-performance web framework for Python
  • python-multipart: Required by FastAPI for file upload handling
  • uvicorn: Lightning-fast ASGI server to run FastAPI

Environment setup

export PARSEFY_API_KEY=pk_your_api_key

Basic setup

from fastapi import FastAPI, UploadFile, HTTPException
from parsefy import Parsefy, APIError
from pydantic import BaseModel, Field

# Application object that the route decorators below register handlers on.
app = FastAPI()
# Single Parsefy client shared by every request handler in this module.
# NOTE(review): no API key is passed here — presumably the SDK reads
# PARSEFY_API_KEY from the environment; confirm against the SDK docs.
client = Parsefy()

# Extraction schema for invoices; handed to the extractor as `schema=Invoice`.
# Each Field description is the only per-field guidance declared here.
class Invoice(BaseModel):
    # REQUIRED (no default) - triggers fallback if below confidence threshold
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount including tax")

    # OPTIONAL (defaults to None) - won't trigger fallback if missing
    date: str | None = Field(default=None, description="Invoice date")
    vendor: str | None = Field(default=None, description="Vendor name")

# POST /extract: run invoice extraction on one uploaded file and return the
# parsed fields together with confidence, credit-usage and verification data.
# Answers 422 with the extractor's own message when the result carries an error.
@app.post("/extract")
async def extract_invoice(file: UploadFile):
    raw_bytes = await file.read()

    outcome = await client.extract_async(
        file=raw_bytes,
        schema=Invoice,
        confidence_threshold=0.85,  # default
        enable_verification=True,  # Enable math verification
    )

    failure = outcome.error
    if failure:
        raise HTTPException(status_code=422, detail=failure.message)

    # Build the payload key by key, in the order clients receive it.
    body = {"data": outcome.data.model_dump()}

    # NOTE(review): `meta` is treated as optional while `metadata` is read
    # unconditionally below — confirm both attributes exist on the SDK result.
    quality = outcome.meta
    if quality:
        body["confidence"] = quality.confidence_score
        body["field_confidence"] = [entry.model_dump() for entry in quality.field_confidence]
    else:
        body["confidence"] = None
        body["field_confidence"] = []

    usage = outcome.metadata
    body["credits"] = usage.credits
    body["fallback_triggered"] = usage.fallback_triggered

    # Verification details are only present when the result provides them.
    checks = outcome.verification
    body["verification"] = checks.model_dump() if checks else None
    return body
Save the code above as main.py, then run it with:
uvicorn main:app --reload

Multiple document types

from enum import Enum
from fastapi import FastAPI, UploadFile, HTTPException, Query
from parsefy import Parsefy
from pydantic import BaseModel, Field

# Application object that the route decorator below registers its handler on.
app = FastAPI()
# Single Parsefy client shared by every request handled by this module.
# NOTE(review): no API key is passed here — presumably the SDK reads
# PARSEFY_API_KEY from the environment; confirm against the SDK docs.
client = Parsefy()

# Closed set of document kinds accepted by the `doc_type` query parameter.
# The `str` mixin makes each member a string, so the raw query value
# ("invoice", "receipt", "bill") maps directly onto a member.
class DocumentType(str, Enum):
    invoice = "invoice"
    receipt = "receipt"
    bill = "bill"

# Extraction schema used when `doc_type` is `DocumentType.invoice`.
class Invoice(BaseModel):
    # REQUIRED (no default)
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount")
    # OPTIONAL (defaults to None)
    date: str | None = Field(default=None, description="Invoice date")
    vendor: str | None = Field(default=None, description="Vendor name")

# Extraction schema used when `doc_type` is `DocumentType.receipt`.
class Receipt(BaseModel):
    # REQUIRED (no default)
    merchant: str = Field(description="Merchant name")
    total: float = Field(description="Total paid")
    # OPTIONAL (defaults to None)
    date: str | None = Field(default=None, description="Transaction date")
    # Items are plain dicts: this schema does not constrain their keys.
    items: list[dict] | None = Field(default=None, description="Purchased items")

# Extraction schema used when `doc_type` is `DocumentType.bill`.
class Bill(BaseModel):
    # REQUIRED (no default)
    provider: str = Field(description="Service provider")
    amount_due: float = Field(description="Amount due")
    # OPTIONAL (defaults to None)
    due_date: str | None = Field(default=None, description="Due date")
    account_number: str | None = Field(default=None, description="Account number")

# Lookup table from the requested document type to the Pydantic model that is
# passed to the extractor as `schema`. Every DocumentType member has an entry,
# so indexing with a validated `doc_type` cannot miss.
SCHEMAS: dict[DocumentType, type[BaseModel]] = {
    DocumentType.invoice: Invoice,
    DocumentType.receipt: Receipt,
    DocumentType.bill: Bill,
}

# POST /extract: extract an uploaded document using the schema selected by the
# `doc_type` query parameter. `confidence_threshold` (validated to 0.0-1.0) and
# `enable_verification` are forwarded to the extractor unchanged.
# Answers 422 with the extractor's own message when the result carries an error.
@app.post("/extract")
async def extract_document(
    file: UploadFile,
    doc_type: DocumentType = Query(default=DocumentType.invoice),
    confidence_threshold: float = Query(default=0.85, ge=0.0, le=1.0),
    enable_verification: bool = Query(default=True)
):
    raw_bytes = await file.read()

    outcome = await client.extract_async(
        file=raw_bytes,
        schema=SCHEMAS[doc_type],
        confidence_threshold=confidence_threshold,
        enable_verification=enable_verification,
    )

    failure = outcome.error
    if failure:
        raise HTTPException(status_code=422, detail=failure.message)

    # NOTE(review): `meta` is treated as optional while `metadata` is read
    # unconditionally below — confirm both attributes exist on the SDK result.
    quality = outcome.meta

    # Collect the fields scoring under the fixed 0.80 reporting cut-off. This
    # cut-off is independent of the caller-supplied confidence_threshold.
    flagged = []
    if quality:
        for entry in quality.field_confidence:
            if entry.score < 0.80:
                flagged.append(entry)

    # Build the payload key by key, in the order clients receive it.
    response = {"type": doc_type.value, "data": outcome.data.model_dump()}

    if quality:
        overall = quality.confidence_score
        per_field = [entry.model_dump() for entry in quality.field_confidence]
    else:
        overall, per_field = None, []

    response["meta"] = {
        "confidence": overall,
        "field_confidence": per_field,
        "low_confidence_fields": [entry.model_dump() for entry in flagged],
        "processing_time_ms": outcome.metadata.processing_time_ms,
        "credits": outcome.metadata.credits,
        "fallback_triggered": outcome.metadata.fallback_triggered,
    }

    # Verification details are only present when the result provides them.
    checks = outcome.verification
    response["verification"] = checks.model_dump() if checks else None
    return response

With background tasks

Return a job ID immediately, process the document in a background task, and poll the status endpoint for the result:
import uuid

from fastapi import BackgroundTasks, FastAPI, HTTPException, UploadFile
from parsefy import Parsefy
from pydantic import BaseModel, Field

# Application object that the route decorators below register handlers on.
app = FastAPI()
# Single Parsefy client shared by the background extraction jobs.
client = Parsefy()

# In-memory store (use Redis/DB in production)
# Maps job_id -> job state dict; every entry carries a "status" key
# ("processing", "completed" or "failed"). Being a plain module-level dict,
# it lives only inside this process.
results: dict[str, dict] = {}

# Extraction schema for the background-job example; passed to the extractor
# as `schema=Invoice`.
class Invoice(BaseModel):
    # Required fields (no default).
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount")
    # Optional field (defaults to None).
    vendor: str | None = Field(default=None, description="Vendor name")

async def process_document(job_id: str, contents: bytes, confidence_threshold: float, enable_verification: bool) -> None:
    """Run invoice extraction for one queued job and store the outcome.

    The final state is written to the module-level ``results`` store under
    ``job_id``: ``{"status": "completed", ...}`` on success, or
    ``{"status": "failed", "error": ...}`` when the extractor reports an
    error or the call itself raises.

    Args:
        job_id: Key under which the job's state is stored.
        contents: Raw bytes of the uploaded document.
        confidence_threshold: Forwarded to ``client.extract_async``.
        enable_verification: Forwarded to ``client.extract_async``.

    Raises:
        Exception: Anything raised by the extraction call, or while
            serialising its result, is re-raised after the job has been
            marked as failed.
    """
    try:
        result = await client.extract_async(
            file=contents,
            schema=Invoice,
            confidence_threshold=confidence_threshold,
            enable_verification=enable_verification,
        )

        if result.error:
            outcome = {"status": "failed", "error": result.error.message}
        else:
            meta = result.meta
            outcome = {
                "status": "completed",
                "data": result.data.model_dump(),
                "confidence": meta.confidence_score if meta else None,
                "field_confidence": [fc.model_dump() for fc in meta.field_confidence] if meta else [],
                "verification": result.verification.model_dump() if result.verification else None,
            }
    except Exception as exc:
        # This runs after the HTTP response has been sent, so nobody is left
        # to receive the exception. Without a terminal state the job would
        # report "processing" forever. Record the failure, then re-raise so
        # the error still reaches the server's own error handling.
        results[job_id] = {"status": "failed", "error": str(exc)}
        raise

    results[job_id] = outcome

# POST /extract/async: accept an upload, register a job in `results` with
# status "processing", schedule `process_document` as a background task and
# return the new job's id straight away.
@app.post("/extract/async")
async def extract_async(
    file: UploadFile,
    background_tasks: BackgroundTasks,
    confidence_threshold: float = 0.85,
    enable_verification: bool = True
):
    # Read the upload inside the request and pass the bytes to the task,
    # which does not receive the UploadFile itself.
    payload = await file.read()
    new_job = str(uuid.uuid4())

    # Register the job before scheduling so it already exists in `results`
    # by the time the task runs.
    results[new_job] = {"status": "processing"}
    background_tasks.add_task(
        process_document,
        new_job,
        payload,
        confidence_threshold,
        enable_verification,
    )

    return {"job_id": new_job}

@app.get("/extract/status/{job_id}")
async def get_status(job_id: str):
    """Return the stored state of a background extraction job.

    Args:
        job_id: Identifier previously returned by ``POST /extract/async``.

    Returns:
        The job's state dict from ``results`` (always contains ``"status"``).

    Raises:
        HTTPException: 404 with detail ``"Job not found"`` when no job with
            this id exists.
    """
    try:
        return results[job_id]
    except KeyError:
        # Returning ``({...}, 404)`` would not set the status code: FastAPI
        # serialises the tuple as a JSON array and answers 200. Raising is
        # what actually produces a 404 response.
        raise HTTPException(status_code=404, detail="Job not found") from None

Testing

# Simple extraction
curl -X POST http://localhost:8000/extract \
  -F "file=@invoice.pdf"

# With document type and confidence threshold
curl -X POST "http://localhost:8000/extract?doc_type=receipt&confidence_threshold=0.80" \
  -F "file=@receipt.pdf"