Installation
Copy
pip install parsefy fastapi python-multipart uvicorn
- parsefy: Parsefy SDK for document extraction
- fastapi: Modern, high-performance web framework for Python
- python-multipart: Required by FastAPI for file upload handling
- uvicorn: Lightning-fast ASGI server to run FastAPI
Environment setup
Copy
export PARSEFY_API_KEY=pk_your_api_key
Basic setup
Copy
from fastapi import FastAPI, UploadFile, HTTPException
from parsefy import Parsefy, APIError
from pydantic import BaseModel, Field
# ASGI application object; the route decorator below registers its handler on it.
app = FastAPI()
# Parsefy SDK client, constructed with no arguments.
# NOTE(review): presumably reads PARSEFY_API_KEY from the environment (see "Environment setup") - confirm.
client = Parsefy()
# Extraction schema for an invoice. Documented with comments rather than a
# docstring so the schema generated from the model stays unchanged.
class Invoice(BaseModel):
    # Required fields (no default): a score below the confidence threshold
    # on these triggers the fallback.
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount including tax")

    # Optional fields (default None): a missing value does not trigger the
    # fallback.
    date: str | None = Field(default=None, description="Invoice date")
    vendor: str | None = Field(default=None, description="Vendor name")
# POST /extract: read the uploaded file, run it through Parsefy against the
# Invoice schema, and return the extracted data together with confidence,
# usage, and verification details. Responds with 422 when the result
# carries an error.
@app.post("/extract")
async def extract_invoice(file: UploadFile):
    payload = await file.read()

    extraction = await client.extract_async(
        file=payload,
        schema=Invoice,
        confidence_threshold=0.85,  # default
        enable_verification=True,  # Enable math verification
    )

    # Failures are reported on the result object rather than raised here.
    if extraction.error:
        raise HTTPException(status_code=422, detail=extraction.error.message)

    response = {"data": extraction.data.model_dump()}

    # Confidence details hang off `meta`, which may be absent.
    if extraction.meta:
        response["confidence"] = extraction.meta.confidence_score
        response["field_confidence"] = [
            entry.model_dump() for entry in extraction.meta.field_confidence
        ]
    else:
        response["confidence"] = None
        response["field_confidence"] = []

    response["credits"] = extraction.metadata.credits
    response["fallback_triggered"] = extraction.metadata.fallback_triggered

    verification = extraction.verification
    response["verification"] = verification.model_dump() if verification else None
    return response
Save the code above as main.py, then start the development server:
Copy
uvicorn main:app --reload
Multiple document types
Copy
from enum import Enum
from fastapi import FastAPI, UploadFile, HTTPException, Query
from parsefy import Parsefy
from pydantic import BaseModel, Field
# ASGI application object; the route decorator below registers its handler on it.
app = FastAPI()
# Parsefy SDK client, constructed with no arguments.
# NOTE(review): presumably reads PARSEFY_API_KEY from the environment (see "Environment setup") - confirm.
client = Parsefy()
# Closed set of document kinds the endpoint accepts. Mixing in `str` makes
# each member compare equal to (and behave as) its plain string value.
class DocumentType(str, Enum):
    invoice = "invoice"
    receipt = "receipt"
    bill = "bill"
# Extraction schema for invoices.
class Invoice(BaseModel):
    # Required fields (no default).
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount")

    # Optional fields (default None).
    date: str | None = Field(default=None, description="Invoice date")
    vendor: str | None = Field(default=None, description="Vendor name")
# Extraction schema for receipts.
class Receipt(BaseModel):
    # Required fields (no default).
    merchant: str = Field(description="Merchant name")
    total: float = Field(description="Total paid")

    # Optional fields (default None). `items` is a list of free-form dicts;
    # no per-item structure is declared.
    date: str | None = Field(default=None, description="Transaction date")
    items: list[dict] | None = Field(default=None, description="Purchased items")
# Extraction schema for bills.
class Bill(BaseModel):
    # Required fields (no default).
    provider: str = Field(description="Service provider")
    amount_due: float = Field(description="Amount due")

    # Optional fields (default None).
    due_date: str | None = Field(default=None, description="Due date")
    account_number: str | None = Field(default=None, description="Account number")
# Lookup table from the requested document type to the Pydantic model that is
# passed to Parsefy as the extraction schema.
SCHEMAS: dict[DocumentType, type[BaseModel]] = {
    DocumentType.invoice: Invoice,
    DocumentType.receipt: Receipt,
    DocumentType.bill: Bill,
}
# POST /extract: extract structured data from an uploaded document.
# Query parameters select the schema (doc_type), the confidence threshold
# forwarded to Parsefy (validated to lie in [0.0, 1.0]), and whether
# verification is enabled. Responds with 422 when the result carries an error.
@app.post("/extract")
async def extract_document(
    file: UploadFile,
    doc_type: DocumentType = Query(default=DocumentType.invoice),
    confidence_threshold: float = Query(default=0.85, ge=0.0, le=1.0),
    enable_verification: bool = Query(default=True),
):
    # Fields scoring below this are listed separately in the response. It is
    # a fixed cutoff, independent of the confidence_threshold parameter.
    low_confidence_cutoff = 0.80

    payload = await file.read()
    extraction = await client.extract_async(
        file=payload,
        schema=SCHEMAS[doc_type],
        confidence_threshold=confidence_threshold,
        enable_verification=enable_verification,
    )

    if extraction.error:
        raise HTTPException(status_code=422, detail=extraction.error.message)

    # Check for low confidence fields (`meta` may be absent).
    meta = extraction.meta
    if meta:
        flagged = [
            entry
            for entry in meta.field_confidence
            if entry.score < low_confidence_cutoff
        ]
    else:
        flagged = []

    body = {
        "type": doc_type.value,
        "data": extraction.data.model_dump(),
    }
    body["meta"] = {
        "confidence": meta.confidence_score if meta else None,
        "field_confidence": (
            [entry.model_dump() for entry in meta.field_confidence] if meta else []
        ),
        "low_confidence_fields": [entry.model_dump() for entry in flagged],
        "processing_time_ms": extraction.metadata.processing_time_ms,
        "credits": extraction.metadata.credits,
        "fallback_triggered": extraction.metadata.fallback_triggered,
    }

    verification = extraction.verification
    body["verification"] = verification.model_dump() if verification else None
    return body
With background tasks
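Process documents asynchronously: the upload endpoint returns a job ID immediately, the extraction runs in a background task, and the result is polled from a status endpoint.
Copy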
import uuid

from fastapi import FastAPI, UploadFile, BackgroundTasks, HTTPException
from parsefy import Parsefy
from pydantic import BaseModel, Field
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()
# Parsefy SDK client, constructed with no arguments.
# NOTE(review): presumably reads PARSEFY_API_KEY from the environment (see "Environment setup") - confirm.
client = Parsefy()
# In-memory store (use Redis/DB in production).
# Maps job_id -> job record; every record has a "status" key
# ("processing", "completed", or "failed").
results: dict[str, dict] = {}
# Extraction schema used by the background job.
class Invoice(BaseModel):
    # Required fields (no default).
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount")

    # Optional field (default None).
    vendor: str | None = Field(default=None, description="Vendor name")
async def process_document(job_id: str, contents: bytes, confidence_threshold: float, enable_verification: bool) -> None:
    """Run the extraction for one job and store the outcome in ``results``.

    Args:
        job_id: Key under which the job record is stored.
        contents: Raw bytes of the uploaded document.
        confidence_threshold: Forwarded to ``client.extract_async``.
        enable_verification: Forwarded to ``client.extract_async``.

    The job record ends up with status ``"completed"`` (plus data,
    confidence, and verification details) or ``"failed"`` (plus an error
    message). Any exception raised by the extraction call is recorded as a
    failure and then re-raised.
    """
    try:
        result = await client.extract_async(
            file=contents,
            schema=Invoice,
            confidence_threshold=confidence_threshold,
            enable_verification=enable_verification,
        )
    except Exception as exc:
        # Without this, a raised error would leave the job stuck in
        # "processing" and pollers would never see a final state. Re-raise
        # so the exception still propagates exactly as it did before.
        results[job_id] = {"status": "failed", "error": str(exc)}
        raise

    # Failures reported on the result object (as opposed to raised).
    if result.error:
        results[job_id] = {"status": "failed", "error": result.error.message}
        return

    # Confidence details hang off `meta`, which may be absent.
    meta = result.meta
    results[job_id] = {
        "status": "completed",
        "data": result.data.model_dump(),
        "confidence": meta.confidence_score if meta else None,
        "field_confidence": [fc.model_dump() for fc in meta.field_confidence] if meta else [],
        "verification": result.verification.model_dump() if result.verification else None,
    }
# POST /extract/async: accept an upload, register a job in `results` with
# status "processing", schedule the extraction as a background task, and
# return the job ID straight away.
@app.post("/extract/async")
async def extract_async(
    file: UploadFile,
    background_tasks: BackgroundTasks,
    confidence_threshold: float = 0.85,
    enable_verification: bool = True,
):
    new_job_id = str(uuid.uuid4())

    # The upload is read here, so the background task receives plain bytes
    # rather than the UploadFile object.
    payload = await file.read()

    # Register the job before scheduling so a status lookup finds it.
    results[new_job_id] = {"status": "processing"}
    background_tasks.add_task(
        process_document,
        new_job_id,
        payload,
        confidence_threshold,
        enable_verification,
    )
    return {"job_id": new_job_id}
@app.get("/extract/status/{job_id}")
async def get_status(job_id: str):
    """Return the stored record for a job.

    Args:
        job_id: Identifier returned by the ``/extract/async`` endpoint.

    Returns:
        The job record from ``results`` (always contains a ``"status"`` key).

    Raises:
        HTTPException: 404 when no job with this ID exists.
    """
    job = results.get(job_id)
    if job is None:
        # Returning `({...}, 404)` would not set the status code: FastAPI
        # serialises the tuple as a JSON array and responds with 200.
        raise HTTPException(status_code=404, detail="Job not found")
    return job
Testing
Copy
# Simple extraction: upload invoice.pdf as the multipart form field "file"
curl -X POST http://localhost:8000/extract \
-F "file=@invoice.pdf"
# With document type and confidence threshold, both passed as query parameters
# (the URL is quoted so the shell does not interpret "&")
curl -X POST "http://localhost:8000/extract?doc_type=receipt&confidence_threshold=0.80" \
-F "file=@receipt.pdf"
