{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "58fab4bb-231e-48cf-8ed4-fc15a1b22845",
"showTitle": false,
"title": ""
}
},
"source": [
"
Databricks-ML-professional-S01b-Experiment-Tracking
"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "5e02262c-d60e-40aa-a4e9-39b9743b00b5",
"showTitle": false,
"title": ""
}
},
"source": [
"\n",
"
\n",
"This Notebook adds information related to the following requirements:
\n",
"Experiment Tracking:\n",
"\n",
"- Manually log parameters, models, and evaluation metrics using MLflow
\n",
"- Programmatically access and use data, metadata, and models from MLflow experiments
\n",
"
\n",
"
\n",
"Download this notebook in ipynb format here.
\n",
"
\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "b5f6d0da-1d81-4fa0-9770-a9e4d6863534",
"showTitle": false,
"title": ""
}
},
"source": [
"\n",
"1. Import libraries
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "8a2d2e59-7426-4d5f-8d97-3dcff6e5151d",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"#\n",
"from pyspark.sql.functions import *\n",
"#\n",
"from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n",
"from pyspark.ml.regression import GBTRegressor\n",
"from pyspark.ml.evaluation import RegressionEvaluator\n",
"from pyspark.ml import Pipeline\n",
"#\n",
"import mlflow\n",
"#\n",
"import logging"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "76e27ecd-7d1c-49ea-93bf-e6056ef8f623",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"# Raise the threshold of the \"mlflow\" logger so that only FATAL records are emitted\n",
"logging.getLogger(name=\"mlflow\").setLevel(level=logging.FATAL)"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "aa08db2c-a856-4c86-81fe-9a8b7322cd6a",
"showTitle": false,
"title": ""
}
},
"source": [
"\n",
"2. Load dataset, convert to Spark DataFrame
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "5b64ff08-1603-4d0c-bc4e-19c0094c3b9c",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"data": {
"text/html": [
"total_bill | tip | sex | smoker | day | time | size |
---|
16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
21.01 | 3.5 | Male | No | Sun | Dinner | 3 |
23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
16.99,
1.01,
"Female",
"No",
"Sun",
"Dinner",
2
],
[
10.34,
1.66,
"Male",
"No",
"Sun",
"Dinner",
3
],
[
21.01,
3.5,
"Male",
"No",
"Sun",
"Dinner",
3
],
[
23.68,
3.31,
"Male",
"No",
"Sun",
"Dinner",
2
],
[
24.59,
3.61,
"Female",
"No",
"Sun",
"Dinner",
4
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "total_bill",
"type": "\"double\""
},
{
"metadata": "{}",
"name": "tip",
"type": "\"double\""
},
{
"metadata": "{}",
"name": "sex",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "smoker",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "day",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "time",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "size",
"type": "\"long\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"# Fetch the seaborn \"tips\" sample dataset as a pandas DataFrame\n",
"tips_df = sns.load_dataset(name=\"tips\")\n",
"\n",
"# Turn it into a Spark DataFrame through the notebook-provided `spark` session\n",
"tips_sdf = spark.createDataFrame(data=tips_df)\n",
"\n",
"# Preview the first five rows\n",
"display(tips_sdf.limit(5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "d2d00502-ebc9-47fd-8026-2f93efa06258",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"data": {
"text/html": [
"total_bill | tip | sex | smoker | day | time | size |
---|
"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "total_bill",
"type": "\"double\""
},
{
"metadata": "{}",
"name": "tip",
"type": "\"double\""
},
{
"metadata": "{}",
"name": "sex",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "smoker",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "day",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "time",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "size",
"type": "\"long\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"# Sanity check: list the rows whose `size` is missing (an empty result means none)\n",
"display(tips_sdf.where(\"size is null\"))"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "2b595b34-0633-4f66-9ca0-6067f4cc0716",
"showTitle": false,
"title": ""
}
},
"source": [
"\n",
"3. Prepare data
"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "453316e6-0dc3-41b0-9730-27c39ed9bdf1",
"showTitle": false,
"title": ""
}
},
"source": [
"Some transformations are applied to prepare the dataset for training an ML model.
\n",
"\n",
"\n",
" column name | \n",
" comment | \n",
"
\n",
"\n",
" tip | \n",
" target to predict. Contains numeric values | \n",
"
\n",
"\n",
" total_bill | \n",
" numeric column to keep as is | \n",
"
\n",
"\n",
" sex | \n",
" Contains Female and Male converted to 1 and 0 | \n",
"
\n",
"\n",
" smoker | \n",
" Contains Yes and No converted to 1 and 0 | \n",
"
\n",
"\n",
" time | \n",
" Contains Dinner and Lunch converted to 1 and 0 | \n",
"
\n",
"\n",
" day | \n",
" categorical column to One Hot Encode | \n",
"
\n",
"\n",
" size | \n",
" categorical column to One Hot Encode | \n",
"
\n",
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "92c6fbbf-0a08-4fee-8ad7-abdf5a0f9ea4",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"# Encode the two-valued string columns as 0/1 integers and keep the other columns as is.\n",
"# NOTE: this overwrites tips_sdf, so re-run the loading cell before re-running this one.\n",
"tips_sdf = tips_sdf.selectExpr(\n",
"  \"total_bill\",\n",
"  \"tip\",\n",
"  \"case when sex = 'Female' then 1 else 0 end as sex\",\n",
"  # raw values are capitalised ('Yes'/'No'): compare case-insensitively, otherwise\n",
"  # the condition never matches and every row ends up with smoker = 0\n",
"  \"case when lower(smoker) = 'yes' then 1 else 0 end as smoker\",\n",
"  \"case when time = 'Dinner' then 1 else 0 end as time\",\n",
"  \"day\",\n",
"  \"size\",\n",
")\n",
"\n",
"# 80/20 train/test split. The fixed seed makes the split reproducible for the same\n",
"# input DataFrame, so RMSE values logged by successive runs stay comparable.\n",
"train_df, test_df = tips_sdf.randomSplit([.8, .2], seed=42)\n",
"\n",
"# Column roles\n",
"ohe_cols = [\"size\", \"day\"]                           # categorical -> one-hot encoded\n",
"num_cols = [\"total_bill\", \"sex\", \"smoker\", \"time\"]   # numeric, used as is\n",
"target_col = \"tip\"                                   # label to predict\n",
"\n",
"# Intermediate column names produced by the indexing and encoding stages\n",
"index_cols = [c+\"_index\" for c in ohe_cols]\n",
"ohe_out_cols = [c+\"_ohe\" for c in ohe_cols]\n",
"\n",
"# Stage 1: map each category to a numeric index; rows with invalid values are skipped\n",
"string_indexer = StringIndexer(inputCols=ohe_cols, outputCols=index_cols, handleInvalid=\"skip\")\n",
"\n",
"# Stage 2: one-hot encode the indexed columns\n",
"ohe = OneHotEncoder(inputCols=index_cols, outputCols=ohe_out_cols)\n",
"\n",
"# Stage 3: assemble encoded and numeric columns into a single \"features\" vector\n",
"assembler_inputs = ohe_out_cols + num_cols\n",
"vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol=\"features\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "910af898-da90-4e26-a856-cdb4b902e101",
"showTitle": false,
"title": ""
}
},
"source": [
"\n",
"4. Evaluator and model
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "06212c8c-e7bf-45e7-827f-fd3fcad64486",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"# Gradient-boosted trees regressor reading the assembled \"features\" vector, 5 iterations\n",
"gbt = GBTRegressor(maxIter=5, labelCol=target_col, featuresCol=\"features\")\n",
"\n",
"# RMSE computed between the label column and the \"prediction\" column\n",
"evaluator = RegressionEvaluator(metricName=\"rmse\", predictionCol=\"prediction\", labelCol=target_col)"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "cd1fc5a1-c77d-45e4-88b2-d2861900b3e5",
"showTitle": false,
"title": ""
}
},
"source": [
"\n",
"\n",
"5. Manually log parameters, models, and evaluation metrics using MLflow
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "c0a1d573-d054-48bb-864a-fb9eab2efaa3",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"# Artifact path under which the fitted pipeline is stored inside the run;\n",
"# the model-loading cell reuses it to build the runs:/ URI\n",
"model_name = \"GBT-Regressor\"\n",
"\n",
"with mlflow.start_run(run_name=\"Tip-run\") as run:\n",
"  # pipeline stages: indexing -> one-hot encoding -> vector assembly -> regressor\n",
"  stages = [string_indexer, ohe, vec_assembler, gbt]\n",
"  pipeline = Pipeline(stages=stages)\n",
"\n",
"  # fit the whole pipeline on the training set\n",
"  model = pipeline.fit(train_df)\n",
"\n",
"  # manually log the fitted pipeline model to MLflow\n",
"  mlflow.spark.log_model(model, model_name)\n",
"\n",
"  # manually log the parameter, read from the estimator itself so the logged\n",
"  # value cannot drift from the one actually used for training\n",
"  mlflow.log_param(\"maxIter\", gbt.getMaxIter())\n",
"\n",
"  # predict on the test set and evaluate the predictions\n",
"  pred_df = model.transform(test_df)\n",
"  rmse = evaluator.evaluate(pred_df)\n",
"\n",
"  # manually log the metric to MLflow\n",
"  mlflow.log_metric(\"rmse\", rmse)"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "b7c1ad7c-c381-4758-bb59-5114ba6f0ba3",
"showTitle": false,
"title": ""
}
},
"source": [
"\n",
"\n",
"6. Programmatically access and use data, metadata, and models from MLflow experiments
"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "89edcb39-0b90-44ca-b6fd-5af69c3115a3",
"showTitle": false,
"title": ""
}
},
"source": [
"This can be done in different ways. One of them is to access it programmatically with the function mlflow.search_runs
which results in a Pandas dataframe containing all useful information for all runs in the current experiment (by default, the current experiment has the name of the current notebook):
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "e5839d28-4117-400d-9a8c-d7fa5fbd0665",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" run_id | \n",
" experiment_id | \n",
" status | \n",
" artifact_uri | \n",
" start_time | \n",
" end_time | \n",
" metrics.rmse | \n",
" params.maxIter | \n",
" tags.mlflow.databricks.cluster.libraries.error | \n",
" tags.mlflow.databricks.notebookRevisionID | \n",
" tags.mlflow.databricks.workspaceID | \n",
" tags.mlflow.databricks.notebookID | \n",
" tags.sparkDatasourceInfo | \n",
" tags.mlflow.log-model.history | \n",
" tags.mlflow.databricks.notebook.commandID | \n",
" tags.mlflow.source.type | \n",
" tags.mlflow.databricks.webappURL | \n",
" tags.mlflow.runName | \n",
" tags.mlflow.databricks.cluster.info | \n",
" tags.mlflow.databricks.cluster.id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3b461072c96e4ce492250c86e5d4b04a | \n",
" 4106912166953874 | \n",
" FINISHED | \n",
" dbfs:/databricks/mlflow-tracking/4106912166953... | \n",
" 2023-11-22 16:47:43.155000+00:00 | \n",
" 2023-11-22 16:48:13.660000+00:00 | \n",
" 1.685712 | \n",
" 5 | \n",
" This message class grpc_shaded.com.databricks.... | \n",
" 1700671694065 | \n",
" 3607579860940718 | \n",
" 4106912166953874 | \n",
" path=mlflowdbfs:/artifacts?run_id=05c39cdd803d... | \n",
" [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n",
" 4723686315041778941_5042614833210616149_6f6f9d... | \n",
" NOTEBOOK | \n",
" https://eastus-c3.azuredatabricks.net | \n",
" Tip-run | \n",
" {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n",
" 1027-081006-5cgi5kuh | \n",
"
\n",
" \n",
" 1 | \n",
" 05c39cdd803d41aaaf1d95e938bccb3c | \n",
" 4106912166953874 | \n",
" FINISHED | \n",
" dbfs:/databricks/mlflow-tracking/4106912166953... | \n",
" 2023-11-22 16:43:02.360000+00:00 | \n",
" 2023-11-22 16:43:32.223000+00:00 | \n",
" 1.071217 | \n",
" 5 | \n",
" This message class grpc_shaded.com.databricks.... | \n",
" 1700671412555 | \n",
" 3607579860940718 | \n",
" 4106912166953874 | \n",
" None | \n",
" [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n",
" 4723686315041778941_8003943757913343583_2f0694... | \n",
" NOTEBOOK | \n",
" https://eastus-c3.azuredatabricks.net | \n",
" Tip-run | \n",
" {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n",
" 1027-081006-5cgi5kuh | \n",
"
\n",
" \n",
" 2 | \n",
" 197b10148baf4c55a9cfd55cb716a45f | \n",
" 4106912166953874 | \n",
" FINISHED | \n",
" dbfs:/databricks/mlflow-tracking/4106912166953... | \n",
" 2023-11-22 16:42:20.492000+00:00 | \n",
" 2023-11-22 16:42:50.008000+00:00 | \n",
" 1.568573 | \n",
" 5 | \n",
" This message class grpc_shaded.com.databricks.... | \n",
" 1700671370330 | \n",
" 3607579860940718 | \n",
" 4106912166953874 | \n",
" None | \n",
" [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n",
" 4723686315041778941_8672488673836230069_a44c4b... | \n",
" NOTEBOOK | \n",
" https://eastus-c3.azuredatabricks.net | \n",
" Tip-run | \n",
" {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n",
" 1027-081006-5cgi5kuh | \n",
"
\n",
" \n",
" 3 | \n",
" 0f53885013194fdab9bc8ad6383e7bdf | \n",
" 4106912166953874 | \n",
" FINISHED | \n",
" dbfs:/databricks/mlflow-tracking/4106912166953... | \n",
" 2023-11-22 16:37:23.545000+00:00 | \n",
" 2023-11-22 16:37:53.810000+00:00 | \n",
" 1.581776 | \n",
" 5 | \n",
" This message class grpc_shaded.com.databricks.... | \n",
" 1700671074137 | \n",
" 3607579860940718 | \n",
" 4106912166953874 | \n",
" None | \n",
" [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n",
" 4723686315041778941_6064693358274921191_17ebf6... | \n",
" NOTEBOOK | \n",
" https://eastus-c3.azuredatabricks.net | \n",
" Tip-run | \n",
" {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n",
" 1027-081006-5cgi5kuh | \n",
"
\n",
" \n",
" 4 | \n",
" 15ba91e309d548c2a85c1f01c006e39b | \n",
" 4106912166953874 | \n",
" FINISHED | \n",
" dbfs:/databricks/mlflow-tracking/4106912166953... | \n",
" 2023-11-22 16:35:48.939000+00:00 | \n",
" 2023-11-22 16:36:25.395000+00:00 | \n",
" 1.498592 | \n",
" 5 | \n",
" This message class grpc_shaded.com.databricks.... | \n",
" 1700670985719 | \n",
" 3607579860940718 | \n",
" 4106912166953874 | \n",
" None | \n",
" [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n",
" 4723686315041778941_8549608798050021702_4078e9... | \n",
" NOTEBOOK | \n",
" https://eastus-c3.azuredatabricks.net | \n",
" Tip-run | \n",
" {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n",
" 1027-081006-5cgi5kuh | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"arguments": {},
"data": "\n\n
\n \n \n | \n run_id | \n experiment_id | \n status | \n artifact_uri | \n start_time | \n end_time | \n metrics.rmse | \n params.maxIter | \n tags.mlflow.databricks.cluster.libraries.error | \n tags.mlflow.databricks.notebookRevisionID | \n tags.mlflow.databricks.workspaceID | \n tags.mlflow.databricks.notebookID | \n tags.sparkDatasourceInfo | \n tags.mlflow.log-model.history | \n tags.mlflow.databricks.notebook.commandID | \n tags.mlflow.source.type | \n tags.mlflow.databricks.webappURL | \n tags.mlflow.runName | \n tags.mlflow.databricks.cluster.info | \n tags.mlflow.databricks.cluster.id | \n
\n \n \n \n 0 | \n 3b461072c96e4ce492250c86e5d4b04a | \n 4106912166953874 | \n FINISHED | \n dbfs:/databricks/mlflow-tracking/4106912166953... | \n 2023-11-22 16:47:43.155000+00:00 | \n 2023-11-22 16:48:13.660000+00:00 | \n 1.685712 | \n 5 | \n This message class grpc_shaded.com.databricks.... | \n 1700671694065 | \n 3607579860940718 | \n 4106912166953874 | \n path=mlflowdbfs:/artifacts?run_id=05c39cdd803d... | \n [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n 4723686315041778941_5042614833210616149_6f6f9d... | \n NOTEBOOK | \n https://eastus-c3.azuredatabricks.net | \n Tip-run | \n {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n 1027-081006-5cgi5kuh | \n
\n \n 1 | \n 05c39cdd803d41aaaf1d95e938bccb3c | \n 4106912166953874 | \n FINISHED | \n dbfs:/databricks/mlflow-tracking/4106912166953... | \n 2023-11-22 16:43:02.360000+00:00 | \n 2023-11-22 16:43:32.223000+00:00 | \n 1.071217 | \n 5 | \n This message class grpc_shaded.com.databricks.... | \n 1700671412555 | \n 3607579860940718 | \n 4106912166953874 | \n None | \n [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n 4723686315041778941_8003943757913343583_2f0694... | \n NOTEBOOK | \n https://eastus-c3.azuredatabricks.net | \n Tip-run | \n {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n 1027-081006-5cgi5kuh | \n
\n \n 2 | \n 197b10148baf4c55a9cfd55cb716a45f | \n 4106912166953874 | \n FINISHED | \n dbfs:/databricks/mlflow-tracking/4106912166953... | \n 2023-11-22 16:42:20.492000+00:00 | \n 2023-11-22 16:42:50.008000+00:00 | \n 1.568573 | \n 5 | \n This message class grpc_shaded.com.databricks.... | \n 1700671370330 | \n 3607579860940718 | \n 4106912166953874 | \n None | \n [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n 4723686315041778941_8672488673836230069_a44c4b... | \n NOTEBOOK | \n https://eastus-c3.azuredatabricks.net | \n Tip-run | \n {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n 1027-081006-5cgi5kuh | \n
\n \n 3 | \n 0f53885013194fdab9bc8ad6383e7bdf | \n 4106912166953874 | \n FINISHED | \n dbfs:/databricks/mlflow-tracking/4106912166953... | \n 2023-11-22 16:37:23.545000+00:00 | \n 2023-11-22 16:37:53.810000+00:00 | \n 1.581776 | \n 5 | \n This message class grpc_shaded.com.databricks.... | \n 1700671074137 | \n 3607579860940718 | \n 4106912166953874 | \n None | \n [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n 4723686315041778941_6064693358274921191_17ebf6... | \n NOTEBOOK | \n https://eastus-c3.azuredatabricks.net | \n Tip-run | \n {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n 1027-081006-5cgi5kuh | \n
\n \n 4 | \n 15ba91e309d548c2a85c1f01c006e39b | \n 4106912166953874 | \n FINISHED | \n dbfs:/databricks/mlflow-tracking/4106912166953... | \n 2023-11-22 16:35:48.939000+00:00 | \n 2023-11-22 16:36:25.395000+00:00 | \n 1.498592 | \n 5 | \n This message class grpc_shaded.com.databricks.... | \n 1700670985719 | \n 3607579860940718 | \n 4106912166953874 | \n None | \n [{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"... | \n 4723686315041778941_8549608798050021702_4078e9... | \n NOTEBOOK | \n https://eastus-c3.azuredatabricks.net | \n Tip-run | \n {\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa... | \n 1027-081006-5cgi5kuh | \n
\n \n
\n
",
"datasetInfos": [],
"metadata": {},
"removedWidgets": [],
"textData": null,
"type": "htmlSandbox"
}
},
"output_type": "display_data"
}
],
"source": [
"# All runs of the current experiment as a pandas DataFrame, without the columns\n",
"# that expose the workspace URL, the notebook path, the source name and the user\n",
"mlflow.search_runs().drop(\n",
"  columns=[\n",
"    'tags.mlflow.databricks.workspaceURL',\n",
"    'tags.mlflow.databricks.notebookPath',\n",
"    'tags.mlflow.source.name',\n",
"    'tags.mlflow.user',\n",
"  ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "f04a8cf6-a501-4e11-a7af-66b9b9bd6744",
"showTitle": false,
"title": ""
}
},
"source": [
"Using Pandas syntax information can be filtered on what is needed:
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "29d945c5-a93c-4f84-a01b-341d71e9f980",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tags.mlflow.runName | \n",
" run_id | \n",
" params.maxIter | \n",
" metrics.rmse | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Tip-run | \n",
" 05c39cdd803d41aaaf1d95e938bccb3c | \n",
" 5 | \n",
" 1.071217 | \n",
"
\n",
" \n",
" 4 | \n",
" Tip-run | \n",
" 15ba91e309d548c2a85c1f01c006e39b | \n",
" 5 | \n",
" 1.498592 | \n",
"
\n",
" \n",
" 2 | \n",
" Tip-run | \n",
" 197b10148baf4c55a9cfd55cb716a45f | \n",
" 5 | \n",
" 1.568573 | \n",
"
\n",
" \n",
" 3 | \n",
" Tip-run | \n",
" 0f53885013194fdab9bc8ad6383e7bdf | \n",
" 5 | \n",
" 1.581776 | \n",
"
\n",
" \n",
" 0 | \n",
" Tip-run | \n",
" 3b461072c96e4ce492250c86e5d4b04a | \n",
" 5 | \n",
" 1.685712 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"arguments": {},
"data": "\n\n
\n \n \n | \n tags.mlflow.runName | \n run_id | \n params.maxIter | \n metrics.rmse | \n
\n \n \n \n 1 | \n Tip-run | \n 05c39cdd803d41aaaf1d95e938bccb3c | \n 5 | \n 1.071217 | \n
\n \n 4 | \n Tip-run | \n 15ba91e309d548c2a85c1f01c006e39b | \n 5 | \n 1.498592 | \n
\n \n 2 | \n Tip-run | \n 197b10148baf4c55a9cfd55cb716a45f | \n 5 | \n 1.568573 | \n
\n \n 3 | \n Tip-run | \n 0f53885013194fdab9bc8ad6383e7bdf | \n 5 | \n 1.581776 | \n
\n \n 0 | \n Tip-run | \n 3b461072c96e4ce492250c86e5d4b04a | \n 5 | \n 1.685712 | \n
\n \n
\n
",
"datasetInfos": [],
"metadata": {},
"removedWidgets": [],
"textData": null,
"type": "htmlSandbox"
}
},
"output_type": "display_data"
}
],
"source": [
"# Keep only the identifying columns of each run and order the runs from lowest to highest RMSE\n",
"(\n",
"  mlflow.search_runs()\n",
"  .loc[:, [\"tags.mlflow.runName\", \"run_id\", \"params.maxIter\", \"metrics.rmse\"]]\n",
"  .sort_values(\"metrics.rmse\")\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "3ea640ea-2ab2-46f6-b53f-a440ef888681",
"showTitle": false,
"title": ""
}
},
"source": [
"A SQL filter can also be applied directly in the mlflow.search_runs()
function by using its filter_string
parameter. This is particularly useful when there are many runs:
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "9844f927-34d9-4ffc-a1f7-c8c17bafc6bb",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tags.mlflow.runName | \n",
" run_id | \n",
" params.maxIter | \n",
" metrics.rmse | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Tip-run | \n",
" 05c39cdd803d41aaaf1d95e938bccb3c | \n",
" 5 | \n",
" 1.071217 | \n",
"
\n",
" \n",
" 1 | \n",
" Tip-run | \n",
" 15ba91e309d548c2a85c1f01c006e39b | \n",
" 5 | \n",
" 1.498592 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"arguments": {},
"data": "\n\n
\n \n \n | \n tags.mlflow.runName | \n run_id | \n params.maxIter | \n metrics.rmse | \n
\n \n \n \n 0 | \n Tip-run | \n 05c39cdd803d41aaaf1d95e938bccb3c | \n 5 | \n 1.071217 | \n
\n \n 1 | \n Tip-run | \n 15ba91e309d548c2a85c1f01c006e39b | \n 5 | \n 1.498592 | \n
\n \n
\n
",
"datasetInfos": [],
"metadata": {},
"removedWidgets": [],
"textData": null,
"type": "htmlSandbox"
}
},
"output_type": "display_data"
}
],
"source": [
"# Let MLflow filter the runs (name containing \"Tip\" and RMSE at most 1.5), then keep the summary columns\n",
"(\n",
"  mlflow.search_runs(filter_string=\"tags.mlflow.runName like '%Tip%' and metrics.rmse<=1.5\")\n",
"  .loc[:, [\"tags.mlflow.runName\", \"run_id\", \"params.maxIter\", \"metrics.rmse\"]]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "de6aca3d-c7f8-4d16-881b-df28551dc63e",
"showTitle": false,
"title": ""
}
},
"source": [
"With this, let's load the best model:
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "d37c9d5a-6eb0-4eee-891f-d547ff1b08b8",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best model path is: runs:/05c39cdd803d41aaaf1d95e938bccb3c/GBT-Regressor\n"
]
}
],
"source": [
"# Order the runs returned by search_runs by ascending RMSE and take the id of the first one\n",
"runs_by_rmse = mlflow.search_runs().sort_values(by=\"metrics.rmse\")\n",
"bestModelRunId = runs_by_rmse[\"run_id\"].iloc[0]\n",
"\n",
"# Build the runs:/ URI from the run id and the artifact path used when logging the model\n",
"best_model_path = f\"runs:/{bestModelRunId}/{model_name}\"\n",
"print(f\"Best model path is: {best_model_path}\")\n",
"\n",
"# Load the logged Spark pipeline model back from MLflow\n",
"loaded_model = mlflow.spark.load_model(best_model_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "75cd5545-72e8-4678-b590-48a52a6e19c7",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"data": {
"text/html": [
"tip | prediction |
---|
1.32 | 2.2600362250689874 |
1.56 | 2.2600362250689874 |
1.57 | 2.270591780624543 |
2.0 | 2.4706720284816104 |
1.01 | 2.621464796497559 |
3.0 | 3.2463005089207058 |
2.5 | 3.1992041196791403 |
3.31 | 2.9060591599135535 |
3.12 | 4.0487121332066645 |
7.58 | 4.6646880493827165 |
1.0 | 1.3015395831199872 |
1.8 | 1.9777634977962606 |
2.0 | 1.9777634977962606 |
1.68 | 2.270591780624543 |
2.09 | 2.564973300067474 |
2.47 | 2.6026518714960454 |
4.06 | 3.227487583919192 |
4.0 | 4.361694589718562 |
4.73 | 4.687208975308642 |
2.5 | 6.218021382716049 |
1.5 | 1.5428140028467654 |
1.61 | 1.5533466844466506 |
2.0 | 1.5533466844466506 |
3.55 | 3.456976 |
2.0 | 1.5428140028467654 |
2.2 | 1.9777634977962606 |
2.01 | 1.9777634977962606 |
2.23 | 1.9882961793961458 |
2.0 | 1.9777634977962606 |
1.58 | 2.270591780624543 |
3.48 | 2.270591780624543 |
2.0 | 2.270591780624543 |
2.02 | 2.270591780624543 |
3.0 | 2.564973300067474 |
1.5 | 2.270591780624543 |
2.5 | 3.192036225068987 |
4.19 | 3.131679615987459 |
5.0 | 5.054009020863314 |
5.16 | 4.408080999391555 |
6.5 | 5.169339707135286 |
2.0 | 3.0488064 |
2.0 | 3.427246104519053 |
1.17 | 1.6696819697747305 |
5.0 | 4.990542308641975 |
"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
1.32,
2.2600362250689874
],
[
1.56,
2.2600362250689874
],
[
1.57,
2.270591780624543
],
[
2,
2.4706720284816104
],
[
1.01,
2.621464796497559
],
[
3,
3.2463005089207058
],
[
2.5,
3.1992041196791403
],
[
3.31,
2.9060591599135535
],
[
3.12,
4.0487121332066645
],
[
7.58,
4.6646880493827165
],
[
1,
1.3015395831199872
],
[
1.8,
1.9777634977962606
],
[
2,
1.9777634977962606
],
[
1.68,
2.270591780624543
],
[
2.09,
2.564973300067474
],
[
2.47,
2.6026518714960454
],
[
4.06,
3.227487583919192
],
[
4,
4.361694589718562
],
[
4.73,
4.687208975308642
],
[
2.5,
6.218021382716049
],
[
1.5,
1.5428140028467654
],
[
1.61,
1.5533466844466506
],
[
2,
1.5533466844466506
],
[
3.55,
3.456976
],
[
2,
1.5428140028467654
],
[
2.2,
1.9777634977962606
],
[
2.01,
1.9777634977962606
],
[
2.23,
1.9882961793961458
],
[
2,
1.9777634977962606
],
[
1.58,
2.270591780624543
],
[
3.48,
2.270591780624543
],
[
2,
2.270591780624543
],
[
2.02,
2.270591780624543
],
[
3,
2.564973300067474
],
[
1.5,
2.270591780624543
],
[
2.5,
3.192036225068987
],
[
4.19,
3.131679615987459
],
[
5,
5.054009020863314
],
[
5.16,
4.408080999391555
],
[
6.5,
5.169339707135286
],
[
2,
3.0488064
],
[
2,
3.427246104519053
],
[
1.17,
1.6696819697747305
],
[
5,
4.990542308641975
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "tip",
"type": "\"double\""
},
{
"metadata": "{\"ml_attr\":{\"attrs\":{\"numeric\":[{\"idx\":8,\"name\":\"total_bill\"},{\"idx\":9,\"name\":\"sex\"},{\"idx\":10,\"name\":\"smoker\"},{\"idx\":11,\"name\":\"time\"}],\"binary\":[{\"idx\":0,\"name\":\"size_ohe_2\"},{\"idx\":1,\"name\":\"size_ohe_3\"},{\"idx\":2,\"name\":\"size_ohe_4\"},{\"idx\":3,\"name\":\"size_ohe_5\"},{\"idx\":4,\"name\":\"size_ohe_6\"},{\"idx\":5,\"name\":\"day_ohe_Sat\"},{\"idx\":6,\"name\":\"day_ohe_Sun\"},{\"idx\":7,\"name\":\"day_ohe_Thur\"}]},\"num_attrs\":12}}",
"name": "prediction",
"type": "\"double\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"# Score test_df with the reloaded pipeline and show the actual tip next to the prediction\n",
"display(loaded_model.transform(test_df).select([\"tip\", \"prediction\"]))"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "67f74bab-3bc1-46ec-a68e-de198cc7b1c7",
"showTitle": false,
"title": ""
}
},
"source": [
"
"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"language": "python",
"notebookMetadata": {
"mostRecentlyExecutedCommandWithImplicitDF": {
"commandId": 121806328486209,
"dataframes": [
"_sqldf"
]
},
"pythonIndentUnit": 2
},
"notebookName": "Databricks-ML-professional-S01b-Experiment-Tracking",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}