Installation
Copy
pip install parsefy flask
- parsefy: Parsefy SDK for document extraction
- flask: Lightweight WSGI web framework for Python
Environment setup
Copy
export PARSEFY_API_KEY=pk_your_api_key
Basic setup
Copy
from flask import Flask, request, jsonify
from parsefy import Parsefy, APIError
from pydantic import BaseModel, Field
# Flask application object; the /extract route below is registered on it.
app = Flask(__name__)
# Parsefy SDK client built with no explicit credentials -- presumably it reads
# the PARSEFY_API_KEY environment variable exported above (confirm in SDK docs).
client = Parsefy()
# Extraction schema: the fields Parsefy is asked to pull out of an invoice.
# (Documented with comments rather than a class docstring so the generated
# model schema stays exactly as before.)
class Invoice(BaseModel):
    # Required fields -- these trigger the fallback when they come back below
    # the confidence threshold.
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount including tax")

    # Optional fields -- a missing value here does not trigger the fallback.
    date: str | None = Field(description="Invoice date", default=None)
    vendor: str | None = Field(description="Vendor name", default=None)
@app.route("/extract", methods=["POST"])
def extract_invoice():
    """Run invoice extraction on an uploaded document.

    Expects a multipart POST whose ``file`` part holds the document.

    Query parameters:
        confidence_threshold: float forwarded to the SDK (default 0.85).
        enable_verification: verification is on only when the value equals
            "true", compared case-insensitively (default "true").

    Returns:
        400 if the ``file`` part is absent, 422 if the extraction result
        carries an error, the ``APIError``'s own status code if the SDK
        raises, otherwise a JSON body with the extracted data and metadata.
    """
    uploads = request.files
    if "file" not in uploads:
        return jsonify({"error": "No file uploaded"}), 400

    document_bytes = uploads["file"].read()

    # Optional tuning knobs supplied through the query string.
    query = request.args
    threshold = query.get("confidence_threshold", 0.85, type=float)
    verify = query.get("enable_verification", "true").lower() == "true"

    try:
        result = client.extract(
            file=document_bytes,
            schema=Invoice,
            confidence_threshold=threshold,
            enable_verification=verify,
        )

        if result.error:
            return jsonify({"error": result.error.message}), 422

        # Assemble the response in the same key order as the documented shape.
        body = {"data": result.data.model_dump()}

        # Confidence details are only present when the result has ``meta``.
        meta = result.meta
        if meta:
            body["confidence"] = meta.confidence_score
            body["field_confidence"] = [
                entry.model_dump() for entry in meta.field_confidence
            ]
        else:
            body["confidence"] = None
            body["field_confidence"] = []

        body["credits"] = result.metadata.credits
        body["fallback_triggered"] = result.metadata.fallback_triggered

        verification = result.verification
        body["verification"] = verification.model_dump() if verification else None

        return jsonify(body)
    except APIError as e:
        # Relay the SDK's message and HTTP status to the caller.
        return jsonify({"error": e.message}), e.status_code
if __name__ == "__main__":
    # Do not hard-code debug=True: debug mode turns on Werkzeug's interactive
    # debugger, which lets anyone who can reach the server execute arbitrary
    # code. With no explicit ``debug`` argument Flask takes the setting from
    # the FLASK_DEBUG environment variable, so opt in for local development
    # by exporting FLASK_DEBUG=1 before starting the script.
    app.run()
With file validation
Copy
from flask import Flask, request, jsonify
from parsefy import Parsefy
from pydantic import BaseModel, Field
from werkzeug.utils import secure_filename
# Flask application object; the /extract route below is registered on it.
app = Flask(__name__)
# Parsefy SDK client built with no explicit credentials -- presumably it reads
# the PARSEFY_API_KEY environment variable (confirm in SDK docs).
client = Parsefy()
# File extensions (lower-case, without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {'pdf', 'docx'}
# Upload size cap, in bytes.
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB


def allowed_file(filename) -> bool:
    """Return True when *filename* ends in one of ``ALLOWED_EXTENSIONS``.

    The extension is the text after the last dot, compared
    case-insensitively. A missing filename (``None`` or an empty string) and
    a name without a dot are rejected instead of raising.

    Args:
        filename: Client-supplied file name, or ``None``.

    Returns:
        ``True`` if the extension is allowed, ``False`` otherwise.
    """
    # Guard first: ``'.' in None`` would raise TypeError.
    if not filename or "." not in filename:
        return False
    extension = filename.rsplit(".", 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
# Extraction schema: the fields Parsefy is asked to pull out of an invoice.
# (Documented with comments rather than a class docstring so the generated
# model schema stays exactly as before.)
class Invoice(BaseModel):
    # Required fields.
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount including tax")

    # Optional fields; each defaults to None when absent.
    date: str | None = Field(description="Invoice date", default=None)
    vendor: str | None = Field(description="Vendor name", default=None)
    line_items: list[dict] | None = Field(description="Line items", default=None)
@app.route("/extract", methods=["POST"])
def extract_invoice():
    """Validate an uploaded document and run invoice extraction on it.

    Expects a multipart POST whose ``file`` part holds a PDF or DOCX document
    of at most ``MAX_FILE_SIZE`` bytes.

    Query parameters:
        confidence_threshold: float forwarded to the SDK (default 0.85).
        enable_verification: verification is on only when the value equals
            "true", compared case-insensitively (default "true").

    Returns:
        400 for a missing, unnamed, disallowed or oversized file; 422 when
        the extraction result carries an error; the ``APIError``'s own status
        code when the SDK raises; otherwise a JSON body with the extracted
        data, confidence metadata and verification details.
    """
    # Imported here because this example's module-level import only brings in
    # ``Parsefy``.
    from parsefy import APIError

    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]

    # ``not`` covers both an empty string and a missing (None) filename.
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Only PDF and DOCX files are allowed"}), 400

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large file into memory.
    contents = file.read(MAX_FILE_SIZE + 1)
    if len(contents) > MAX_FILE_SIZE:
        return jsonify({"error": "File too large. Max 10MB"}), 400

    confidence_threshold = request.args.get("confidence_threshold", 0.85, type=float)
    enable_verification = request.args.get("enable_verification", "true").lower() == "true"

    try:
        result = client.extract(
            file=contents,
            schema=Invoice,
            confidence_threshold=confidence_threshold,
            enable_verification=enable_verification,
        )
    except APIError as e:
        # Relay SDK failures with their own status instead of an unhandled
        # 500, matching the basic example's error handling.
        return jsonify({"error": e.message}), e.status_code

    if result.error:
        return jsonify({
            "error": result.error.code,
            "message": result.error.message,
        }), 422

    # Confidence details are only present when the result has ``meta``.
    field_confidence = result.meta.field_confidence if result.meta else []
    # Flag fields scoring under 0.80 so callers can review them.
    low_confidence = [fc for fc in field_confidence if fc.score < 0.80]

    return jsonify({
        "data": result.data.model_dump(),
        "meta": {
            "confidence": result.meta.confidence_score if result.meta else None,
            "field_confidence": [fc.model_dump() for fc in field_confidence],
            "low_confidence_fields": [fc.model_dump() for fc in low_confidence],
            "processing_time_ms": result.metadata.processing_time_ms,
            "credits": result.metadata.credits,
            "fallback_triggered": result.metadata.fallback_triggered,
        },
        "verification": result.verification.model_dump() if result.verification else None,
    })
if __name__ == "__main__":
    # Do not hard-code debug=True: debug mode turns on Werkzeug's interactive
    # debugger, which lets anyone who can reach the server execute arbitrary
    # code. With no explicit ``debug`` argument Flask takes the setting from
    # the FLASK_DEBUG environment variable, so opt in for local development
    # by exporting FLASK_DEBUG=1 before starting the script.
    app.run()
Multiple schemas
Copy
from flask import Flask, request, jsonify
from parsefy import Parsefy
from pydantic import BaseModel, Field
# Flask application object; the /extract/<doc_type> route below is registered on it.
app = Flask(__name__)
# Parsefy SDK client built with no explicit credentials -- presumably it reads
# the PARSEFY_API_KEY environment variable (confirm in SDK docs).
client = Parsefy()
# Extraction schema for invoices. Documented with comments rather than a class
# docstring so the generated model schema stays exactly as before.
class Invoice(BaseModel):
    # Required fields.
    invoice_number: str = Field(description="The invoice number")
    total: float = Field(description="Total amount")

    # Optional field; defaults to None when absent.
    vendor: str | None = Field(description="Vendor name", default=None)
# Extraction schema for receipts. Documented with comments rather than a class
# docstring so the generated model schema stays exactly as before.
class Receipt(BaseModel):
    # Required fields.
    merchant: str = Field(description="Merchant name")
    total: float = Field(description="Total paid")

    # Optional field; defaults to None when absent.
    date: str | None = Field(description="Transaction date", default=None)
# Extraction schema for bills. Documented with comments rather than a class
# docstring so the generated model schema stays exactly as before.
class Bill(BaseModel):
    # Required fields.
    provider: str = Field(description="Service provider")
    amount_due: float = Field(description="Amount due")

    # Optional field; defaults to None when absent.
    due_date: str | None = Field(description="Due date", default=None)
# Maps the <doc_type> URL segment to the schema class used for extraction.
SCHEMAS: dict[str, type[BaseModel]] = {
    "invoice": Invoice,
    "receipt": Receipt,
    "bill": Bill,
}
@app.route("/extract/<doc_type>", methods=["POST"])
def extract_document(doc_type):
    """Run extraction on an uploaded document using the schema for *doc_type*.

    Args:
        doc_type: URL path segment naming the schema; must be a key of
            ``SCHEMAS``.

    Query parameters:
        confidence_threshold: float forwarded to the SDK (default 0.85).
        enable_verification: verification is on only when the value equals
            "true", compared case-insensitively (default "true").

    Returns:
        400 for an unknown ``doc_type`` or a missing ``file`` part; 422 when
        the extraction result carries an error; the ``APIError``'s own status
        code when the SDK raises; otherwise a JSON body with the document
        type, extracted data, confidence and verification details.
    """
    # Imported here because this example's module-level import only brings in
    # ``Parsefy``.
    from parsefy import APIError

    if doc_type not in SCHEMAS:
        return jsonify({
            "error": f"Invalid type. Allowed: {', '.join(SCHEMAS)}"
        }), 400

    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    contents = file.read()
    schema = SCHEMAS[doc_type]

    confidence_threshold = request.args.get("confidence_threshold", 0.85, type=float)
    enable_verification = request.args.get("enable_verification", "true").lower() == "true"

    try:
        result = client.extract(
            file=contents,
            schema=schema,
            confidence_threshold=confidence_threshold,
            enable_verification=enable_verification,
        )
    except APIError as e:
        # Relay SDK failures with their own status instead of an unhandled
        # 500, matching the basic example's error handling.
        return jsonify({"error": e.message}), e.status_code

    if result.error:
        return jsonify({"error": result.error.message}), 422

    return jsonify({
        "type": doc_type,
        "data": result.data.model_dump(),
        "confidence": result.meta.confidence_score if result.meta else None,
        "field_confidence": [fc.model_dump() for fc in result.meta.field_confidence] if result.meta else [],
        "verification": result.verification.model_dump() if result.verification else None,
    })
if __name__ == "__main__":
    # Do not hard-code debug=True: debug mode turns on Werkzeug's interactive
    # debugger, which lets anyone who can reach the server execute arbitrary
    # code. With no explicit ``debug`` argument Flask takes the setting from
    # the FLASK_DEBUG environment variable, so opt in for local development
    # by exporting FLASK_DEBUG=1 before starting the script.
    app.run()
Testing
Copy
# Extract invoice
curl -X POST http://localhost:5000/extract \
-F "file=@invoice.pdf"
# Extract with custom confidence threshold
curl -X POST "http://localhost:5000/extract?confidence_threshold=0.90" \
-F "file=@invoice.pdf"
# Extract a specific document type (uses the /extract/<doc_type> route from the multiple-schemas example)
curl -X POST http://localhost:5000/extract/receipt \
-F "file=@receipt.pdf"
