Skip to main content

Installation

pip install parsefy flask
  • parsefy: Parsefy SDK for document extraction
  • flask: Lightweight WSGI web framework for Python

Environment setup

export PARSEFY_API_KEY=pk_your_api_key

Basic setup

from flask import Flask, request, jsonify
from parsefy import Parsefy, APIError
from pydantic import BaseModel, Field

# Flask application object; the @app.route decorator below registers the endpoint on it.
app = Flask(__name__)
# Single Parsefy client created at import time and reused by every request.
# NOTE(review): no API key is passed here — presumably the SDK picks up
# PARSEFY_API_KEY from the environment (see "Environment setup"); confirm.
client = Parsefy()

# Extraction schema handed to client.extract(schema=Invoice) by the /extract route.
# Each Field description is the text that labels the value to extract.
class Invoice(BaseModel):
    # REQUIRED - triggers fallback if below confidence threshold
    # (no default is declared, so these two fields must always be present)
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount including tax")
    
    # OPTIONAL - won't trigger fallback if missing
    # (declared as `str | None` with default=None)
    date: str | None = Field(default=None, description="Invoice date")
    vendor: str | None = Field(default=None, description="Vendor name")

@app.route("/extract", methods=["POST"])
def extract_invoice():
    """Extract an uploaded document against the ``Invoice`` schema.

    The document is read from the multipart form field ``file``. Two optional
    query parameters tune the call:

    * ``confidence_threshold`` -- parsed as a float, defaults to 0.85.
    * ``enable_verification`` -- on only when the value is "true"
      (case-insensitive); defaults to "true".

    Returns:
        400 if no ``file`` part was uploaded, 422 if the extraction result
        carries an error, the ``APIError`` status code if the SDK raises, and
        otherwise a JSON body with the extracted data, confidence details,
        credit usage, fallback flag and verification result.
    """
    upload = request.files.get("file")
    if upload is None:
        return jsonify({"error": "No file uploaded"}), 400

    payload = upload.read()

    # Optional query parameters
    threshold = request.args.get("confidence_threshold", 0.85, type=float)
    verification_flag = request.args.get("enable_verification", "true")
    verify = verification_flag.lower() == "true"

    try:
        outcome = client.extract(
            file=payload,
            schema=Invoice,
            confidence_threshold=threshold,
            enable_verification=verify,
        )

        if outcome.error:
            return jsonify({"error": outcome.error.message}), 422

        body = {"data": outcome.data.model_dump()}

        # Confidence details are only reported when the result has a `meta` part.
        if outcome.meta:
            body["confidence"] = outcome.meta.confidence_score
            body["field_confidence"] = [
                entry.model_dump() for entry in outcome.meta.field_confidence
            ]
        else:
            body["confidence"] = None
            body["field_confidence"] = []

        body["credits"] = outcome.metadata.credits
        body["fallback_triggered"] = outcome.metadata.fallback_triggered

        if outcome.verification:
            body["verification"] = outcome.verification.model_dump()
        else:
            body["verification"] = None

        return jsonify(body)
    except APIError as exc:
        # Pass the SDK's message and status code straight through to the caller.
        return jsonify({"error": exc.message}), exc.status_code

# Start the app with Flask's built-in server when this file is run directly.
# NOTE(review): debug=True is meant for local development — disable it before deploying.
if __name__ == "__main__":
    app.run(debug=True)

With file validation

from flask import Flask, request, jsonify
from parsefy import Parsefy
from pydantic import BaseModel, Field
from werkzeug.utils import secure_filename

# Flask application object; the @app.route decorator below registers the endpoint on it.
app = Flask(__name__)
# Single Parsefy client created at import time and reused by every request.
# NOTE(review): no API key is passed here — presumably the SDK picks up
# PARSEFY_API_KEY from the environment (see "Environment setup"); confirm.
client = Parsefy()

# Upload constraints enforced by the /extract endpoint.
ALLOWED_EXTENSIONS = {"pdf", "docx"}  # lower-case, without the leading dot
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB, expressed in bytes


def allowed_file(filename):
    """Return True when *filename* ends in one of ``ALLOWED_EXTENSIONS``.

    Only the text after the last dot is considered, and it is compared
    case-insensitively. A name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    return dot == "." and extension.lower() in ALLOWED_EXTENSIONS

# Extraction schema handed to client.extract(schema=Invoice) by the /extract route.
# Each Field description is the text that labels the value to extract.
class Invoice(BaseModel):
    # REQUIRED (no default declared)
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount including tax")
    
    # OPTIONAL (every field below defaults to None)
    date: str | None = Field(default=None, description="Invoice date")
    vendor: str | None = Field(default=None, description="Vendor name")
    # Line items are plain dicts: no per-item schema is declared here.
    line_items: list[dict] | None = Field(default=None, description="Line items")

@app.route("/extract", methods=["POST"])
def extract_invoice():
    """Validate an uploaded PDF/DOCX and extract it against the ``Invoice`` schema.

    The document is read from the multipart form field ``file``. Two optional
    query parameters tune the call:

    * ``confidence_threshold`` -- parsed as a float, defaults to 0.85.
    * ``enable_verification`` -- on only when the value is "true"
      (case-insensitive); defaults to "true".

    Returns:
        200 with ``data``, ``meta`` and ``verification`` on success;
        400 when the upload is missing, has no filename, has a disallowed
        extension, or is larger than ``MAX_FILE_SIZE``;
        422 when the extraction result carries an error;
        the ``APIError`` status code when the SDK call itself fails.
    """
    # Imported here because this example's import block only pulls in Parsefy;
    # move it next to that import if you prefer module-level imports.
    from parsefy import APIError

    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]

    # A falsy check rejects both "" and a missing (None) filename; the latter
    # would otherwise raise TypeError inside allowed_file().
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Only PDF and DOCX files are allowed"}), 400

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without first copying the whole thing into memory.
    # To reject large requests before they are parsed at all, additionally set
    # app.config["MAX_CONTENT_LENGTH"].
    contents = file.read(MAX_FILE_SIZE + 1)

    if len(contents) > MAX_FILE_SIZE:
        return jsonify({"error": "File too large. Max 10MB"}), 400

    confidence_threshold = request.args.get("confidence_threshold", 0.85, type=float)
    enable_verification = request.args.get("enable_verification", "true").lower() == "true"

    # Only the SDK call is guarded, so unrelated bugs are not reported as API errors.
    try:
        result = client.extract(
            file=contents,
            schema=Invoice,
            confidence_threshold=confidence_threshold,
            enable_verification=enable_verification,
        )
    except APIError as e:
        # Same {"error": <code>, "message": <text>} shape as the 422 response below.
        return jsonify({
            "error": "api_error",
            "message": e.message,
        }), e.status_code

    if result.error:
        return jsonify({
            "error": result.error.code,
            "message": result.error.message,
        }), 422

    # Per-field confidence is only available when the result has a `meta` part.
    field_confidence = result.meta.field_confidence if result.meta else []

    # Fields scoring under 0.80 are surfaced separately so callers can review them.
    low_confidence = [fc for fc in field_confidence if fc.score < 0.80]

    return jsonify({
        "data": result.data.model_dump(),
        "meta": {
            "confidence": result.meta.confidence_score if result.meta else None,
            "field_confidence": [fc.model_dump() for fc in field_confidence],
            "low_confidence_fields": [fc.model_dump() for fc in low_confidence],
            "processing_time_ms": result.metadata.processing_time_ms,
            "credits": result.metadata.credits,
            "fallback_triggered": result.metadata.fallback_triggered,
        },
        "verification": result.verification.model_dump() if result.verification else None,
    })

# Start the app with Flask's built-in server when this file is run directly.
# NOTE(review): debug=True is meant for local development — disable it before deploying.
if __name__ == "__main__":
    app.run(debug=True)

Multiple schemas

from flask import Flask, request, jsonify
from parsefy import Parsefy
from pydantic import BaseModel, Field

# Flask application object; the @app.route decorator below registers the endpoint on it.
app = Flask(__name__)
# Single Parsefy client created at import time and reused by every request.
# NOTE(review): no API key is passed here — presumably the SDK picks up
# PARSEFY_API_KEY from the environment (see "Environment setup"); confirm.
client = Parsefy()

# Schema used for the "invoice" document type (see SCHEMAS).
class Invoice(BaseModel):
    # REQUIRED (no default declared)
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount")
    # OPTIONAL (defaults to None)
    vendor: str | None = Field(default=None, description="Vendor name")

# Schema used for the "receipt" document type (see SCHEMAS).
class Receipt(BaseModel):
    # REQUIRED (no default declared)
    merchant: str = Field(description="Merchant name")
    total: float = Field(description="Total paid")
    # OPTIONAL (defaults to None)
    date: str | None = Field(default=None, description="Transaction date")

# Schema used for the "bill" document type (see SCHEMAS).
class Bill(BaseModel):
    # REQUIRED (no default declared)
    provider: str = Field(description="Service provider")
    amount_due: float = Field(description="Amount due")
    # OPTIONAL (defaults to None)
    due_date: str | None = Field(default=None, description="Due date")

# Maps the <doc_type> segment of /extract/<doc_type> to the model used for
# extraction. The keys are also what the 400 "Invalid type" response lists as
# allowed values, so adding an entry here is all it takes to support a new type.
SCHEMAS = {
    "invoice": Invoice,
    "receipt": Receipt,
    "bill": Bill,
}

@app.route("/extract/<doc_type>", methods=["POST"])
def extract_document(doc_type):
    """Extract an uploaded document using the schema registered for *doc_type*.

    Args:
        doc_type: URL path segment naming the document type; must be a key of
            ``SCHEMAS``.

    The document is read from the multipart form field ``file``. Two optional
    query parameters tune the call:

    * ``confidence_threshold`` -- parsed as a float, defaults to 0.85.
    * ``enable_verification`` -- on only when the value is "true"
      (case-insensitive); defaults to "true".

    Returns:
        200 with ``type``, ``data``, confidence details and ``verification``
        on success;
        400 when *doc_type* is unknown or no ``file`` part was uploaded;
        422 when the extraction result carries an error;
        the ``APIError`` status code when the SDK call itself fails.
    """
    # Imported here because this example's import block only pulls in Parsefy;
    # move it next to that import if you prefer module-level imports.
    from parsefy import APIError

    schema = SCHEMAS.get(doc_type)
    if schema is None:
        return jsonify({
            "error": f"Invalid type. Allowed: {', '.join(SCHEMAS)}"
        }), 400

    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    contents = file.read()

    confidence_threshold = request.args.get("confidence_threshold", 0.85, type=float)
    enable_verification = request.args.get("enable_verification", "true").lower() == "true"

    # Only the SDK call is guarded, so unrelated bugs are not reported as API errors.
    try:
        result = client.extract(
            file=contents,
            schema=schema,
            confidence_threshold=confidence_threshold,
            enable_verification=enable_verification,
        )
    except APIError as e:
        # Same {"error": <message>} shape as the other error responses here.
        return jsonify({"error": e.message}), e.status_code

    if result.error:
        return jsonify({"error": result.error.message}), 422

    # Per-field confidence is only available when the result has a `meta` part.
    field_confidence = result.meta.field_confidence if result.meta else []

    return jsonify({
        "type": doc_type,
        "data": result.data.model_dump(),
        "confidence": result.meta.confidence_score if result.meta else None,
        "field_confidence": [fc.model_dump() for fc in field_confidence],
        "verification": result.verification.model_dump() if result.verification else None,
    })

# Start the app with Flask's built-in server when this file is run directly.
# NOTE(review): debug=True is meant for local development — disable it before deploying.
if __name__ == "__main__":
    app.run(debug=True)

Testing

# Extract an invoice using the endpoint defaults
# (confidence_threshold=0.85, enable_verification=true)
curl -X POST http://localhost:5000/extract \
  -F "file=@invoice.pdf"

# Extract with a custom confidence threshold, passed as a query parameter
curl -X POST "http://localhost:5000/extract?confidence_threshold=0.90" \
  -F "file=@invoice.pdf"

# Extract with an explicit document type in the path ("Multiple schemas" example);
# the type must be one of: invoice, receipt, bill
curl -X POST http://localhost:5000/extract/receipt \
  -F "file=@receipt.pdf"