Serving Models from Databricks: The MLflow Registry-to-Endpoint Deployment Path
You've trained a model, logged it to MLflow, promoted it through Staging to Production in the Model Registry. Now you need something outside of Databricks to call it. A mobile app, a web service, a real-time scoring API. The path from registry entry to callable HTTP endpoint is shorter than it used to be — but there are configuration decisions that significantly affect performance and cost.
Creating a Serving Endpoint from the Registry
import requests
# Base URL of the Databricks workspace; every REST call below is built on it.
# Placeholder value: replace with your own workspace URL.
DATABRICKS_HOST = "https://your-workspace.azuredatabricks.net"
# Token sent as the Bearer credential on every request below.
# Placeholder value: in real code load it from a secret store or environment
# variable rather than hard-coding it in source.
TOKEN = "dapi..."
def create_serving_endpoint(
    endpoint_name: str,
    model_name: str,
    model_stage: str = "Production",
    workload_size: str = "Small",
    scale_to_zero: bool = True
) -> dict:
    """Create a serving endpoint for the latest registry version in a stage.

    Args:
        endpoint_name: Name given to the new serving endpoint.
        model_name: Registered model name in the MLflow Model Registry.
        model_stage: Registry stage whose latest version is served.
        workload_size: Value sent as the served model's ``workload_size``.
        scale_to_zero: Value sent as ``scale_to_zero_enabled``.

    Returns:
        The JSON body of the create-endpoint response, parsed into a dict.

    Raises:
        ValueError: If the model has no version in ``model_stage``.
        requests.HTTPError: If the API answers with an error status.
    """
    # Function-scope import, as in get_production_version, so mlflow is only
    # required when an endpoint is actually created.
    from mlflow.tracking import MlflowClient

    # Honor the requested stage instead of always serving Production.
    stage_versions = MlflowClient().get_latest_versions(
        model_name, stages=[model_stage]
    )
    if not stage_versions:
        raise ValueError(f"No {model_stage} version found for {model_name}")

    payload = {
        "name": endpoint_name,
        "config": {
            "served_models": [{
                "model_name": model_name,
                "model_version": stage_versions[0].version,
                "workload_size": workload_size,
                "scale_to_zero_enabled": scale_to_zero
            }]
        }
    }
    resp = requests.post(
        f"{DATABRICKS_HOST}/api/2.0/serving-endpoints",
        headers={"Authorization": f"Bearer {TOKEN}"},
        json=payload,
        # Same bound as score_batch: never hang forever on a stalled connection.
        timeout=30
    )
    resp.raise_for_status()
    return resp.json()
def get_production_version(model_name: str) -> str:
    """Return the version of ``model_name`` currently in the Production stage.

    Raises:
        ValueError: If the registry reports no Production version.
    """
    from mlflow.tracking import MlflowClient

    production_versions = MlflowClient().get_latest_versions(
        model_name, stages=["Production"]
    )
    if production_versions:
        return production_versions[0].version
    raise ValueError(f"No Production version found for {model_name}")
Scoring Against the Endpoint
import pandas as pd
import json
def score_batch(endpoint_name: str, records: pd.DataFrame) -> list:
    """Send the rows of ``records`` to a serving endpoint and return its predictions.

    The frame is serialized as a list of row dicts under the
    ``dataframe_records`` key, and the ``predictions`` field of the JSON
    response is returned.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    invocation_url = f"{DATABRICKS_HOST}/serving-endpoints/{endpoint_name}/invocations"
    request_headers = {
        "Authorization": f"Bearer {TOKEN}",
        "Content-Type": "application/json",
    }
    body = {"dataframe_records": records.to_dict(orient="records")}

    response = requests.post(
        invocation_url,
        headers=request_headers,
        json=body,
        timeout=30,
    )
    response.raise_for_status()
    return response.json()["predictions"]
# Example: score two sample transactions against the deployed endpoint.
sample_records = [
    {"account_age_days": 90, "transaction_amount": 2500.00, "region_code": "WEST"},
    {"account_age_days": 7, "transaction_amount": 9999.99, "region_code": "EAST"},
]
test_df = pd.DataFrame(sample_records)
predictions = score_batch(endpoint_name="risk-classifier-serving", records=test_df)
Traffic Splitting for Safe Rollouts
When you promote a new model version, you can split traffic rather than doing a hard cutover:
def shift_traffic(endpoint_name: str, model_name: str, new_version: str, new_pct: int) -> None:
    """Split an endpoint's traffic between the Production version and a new one.

    ``new_pct`` percent of requests go to ``new_version``; the rest stay on
    the version currently in the Production stage. A version that would get
    0% is left out of the config, so ``new_pct=100`` is a full cutover.

    Args:
        endpoint_name: Serving endpoint whose config is replaced.
        model_name: Registered model name shared by both versions.
        new_version: Registry version receiving ``new_pct`` of traffic.
        new_pct: Share of traffic for ``new_version``, from 0 to 100.

    Raises:
        ValueError: If ``new_pct`` is outside 0-100, or no Production
            version exists (raised by ``get_production_version``).
        requests.HTTPError: If the API answers with an error status.
    """
    # Validate before any registry or network call.
    if not 0 <= new_pct <= 100:
        raise ValueError(f"new_pct must be between 0 and 100, got {new_pct}")

    current_version = str(get_production_version(model_name))
    new_version = str(new_version)

    if current_version == new_version:
        # The new version is already the Production one; two served models
        # with the same name would collide, so serve it once at 100%.
        allocations = {new_version: 100}
    else:
        allocations = {current_version: 100 - new_pct, new_version: new_pct}
    # Drop versions that would receive no traffic.
    allocations = {ver: pct for ver, pct in allocations.items() if pct > 0}

    # Explicit served-model names so the routes below can reference them.
    served_names = {ver: f"{model_name}-{ver}" for ver in allocations}

    payload = {
        "served_models": [
            {
                "name": served_names[ver],
                "model_name": model_name,
                "model_version": ver,
                "workload_size": "Small",
                "scale_to_zero_enabled": False
            }
            for ver in allocations
        ],
        # The split is expressed as routes keyed by served model name, not as
        # a field on each served model.
        "traffic_config": {
            "routes": [
                {"served_model_name": served_names[ver], "traffic_percentage": pct}
                for ver, pct in allocations.items()
            ]
        }
    }
    requests.put(
        f"{DATABRICKS_HOST}/api/2.0/serving-endpoints/{endpoint_name}/config",
        headers={"Authorization": f"Bearer {TOKEN}"},
        json=payload,
        # Bound the call so a stalled connection cannot hang a rollout.
        timeout=30
    ).raise_for_status()
# Step 1: canary - route 10% of traffic to the new version.
shift_traffic(
    endpoint_name="risk-classifier-serving",
    model_name="RiskClassifier",
    new_version="5",
    new_pct=10,
)
# Step 2: once the canary has been validated, go to an even split.
shift_traffic(
    endpoint_name="risk-classifier-serving",
    model_name="RiskClassifier",
    new_version="5",
    new_pct=50,
)
# Step 3: full cutover to the new version.
shift_traffic(
    endpoint_name="risk-classifier-serving",
    model_name="RiskClassifier",
    new_version="5",
    new_pct=100,
)
Workload Size vs Scale-to-Zero
The tradeoffs are straightforward. Small endpoints can handle ~5-20 concurrent requests; Medium handles more; Large handles high-throughput production workloads. Scale-to-zero terminates the serving cluster when idle, then cold-starts (2-5 minutes) on the next request. For dev/staging or low-traffic models, scale-to-zero saves money. For production endpoints where latency matters on the first request, disable it and keep the endpoint warm.