{ "cells": [ { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "58fab4bb-231e-48cf-8ed4-fc15a1b22845", "showTitle": false, "title": "" } }, "source": [ "

Databricks-ML-professional-S01b-Experiment-Tracking

" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "5e02262c-d60e-40aa-a4e9-39b9743b00b5", "showTitle": false, "title": "" } }, "source": [ "
\n", "
\n", "

This Notebook adds information related to the following requirements:


\n", "Experiment Tracking:\n", "\n", "
\n", "

Download this notebook in ipynb format here.

\n", "
\n", "
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b5f6d0da-1d81-4fa0-9770-a9e4d6863534", "showTitle": false, "title": "" } }, "source": [ "
\n", "1. Import libraries
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8a2d2e59-7426-4d5f-8d97-3dcff6e5151d", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "#\n", "from pyspark.sql.functions import *\n", "#\n", "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", "from pyspark.ml.regression import GBTRegressor\n", "from pyspark.ml.evaluation import RegressionEvaluator\n", "from pyspark.ml import Pipeline\n", "#\n", "import mlflow\n", "#\n", "import logging" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "76e27ecd-7d1c-49ea-93bf-e6056ef8f623", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "logging.getLogger(\"mlflow\").setLevel(logging.FATAL)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "aa08db2c-a856-4c86-81fe-9a8b7322cd6a", "showTitle": false, "title": "" } }, "source": [ "
\n", "2. Load dataset, convert to Spark DataFrame
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5b64ff08-1603-4d0c-bc4e-19c0094c3b9c", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
total_billtipsexsmokerdaytimesize
16.991.01FemaleNoSunDinner2
10.341.66MaleNoSunDinner3
21.013.5MaleNoSunDinner3
23.683.31MaleNoSunDinner2
24.593.61FemaleNoSunDinner4
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "aggData": [], "aggError": "", "aggOverflow": false, "aggSchema": [], "aggSeriesLimitReached": false, "aggType": "", "arguments": {}, "columnCustomDisplayInfos": {}, "data": [ [ 16.99, 1.01, "Female", "No", "Sun", "Dinner", 2 ], [ 10.34, 1.66, "Male", "No", "Sun", "Dinner", 3 ], [ 21.01, 3.5, "Male", "No", "Sun", "Dinner", 3 ], [ 23.68, 3.31, "Male", "No", "Sun", "Dinner", 2 ], [ 24.59, 3.61, "Female", "No", "Sun", "Dinner", 4 ] ], "datasetInfos": [], "dbfsResultPath": null, "isJsonSchema": true, "metadata": {}, "overflow": false, "plotOptions": { "customPlotOptions": {}, "displayType": "table", "pivotAggregation": null, "pivotColumns": null, "xColumns": null, "yColumns": null }, "removedWidgets": [], "schema": [ { "metadata": "{}", "name": "total_bill", "type": "\"double\"" }, { "metadata": "{}", "name": "tip", "type": "\"double\"" }, { "metadata": "{}", "name": "sex", "type": "\"string\"" }, { "metadata": "{}", "name": "smoker", "type": "\"string\"" }, { "metadata": "{}", "name": "day", "type": "\"string\"" }, { "metadata": "{}", "name": "time", "type": "\"string\"" }, { "metadata": "{}", "name": "size", "type": "\"long\"" } ], "type": "table" } }, "output_type": "display_data" } ], "source": [ "tips_df = sns.load_dataset(\"tips\")\n", "#\n", "tips_sdf = spark.createDataFrame(tips_df)\n", "#\n", "display(tips_sdf.limit(5))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "d2d00502-ebc9-47fd-8026-2f93efa06258", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
total_billtipsexsmokerdaytimesize
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "aggData": [], "aggError": "", "aggOverflow": false, "aggSchema": [], "aggSeriesLimitReached": false, "aggType": "", "arguments": {}, "columnCustomDisplayInfos": {}, "data": [], "datasetInfos": [], "dbfsResultPath": null, "isJsonSchema": true, "metadata": {}, "overflow": false, "plotOptions": { "customPlotOptions": {}, "displayType": "table", "pivotAggregation": null, "pivotColumns": null, "xColumns": null, "yColumns": null }, "removedWidgets": [], "schema": [ { "metadata": "{}", "name": "total_bill", "type": "\"double\"" }, { "metadata": "{}", "name": "tip", "type": "\"double\"" }, { "metadata": "{}", "name": "sex", "type": "\"string\"" }, { "metadata": "{}", "name": "smoker", "type": "\"string\"" }, { "metadata": "{}", "name": "day", "type": "\"string\"" }, { "metadata": "{}", "name": "time", "type": "\"string\"" }, { "metadata": "{}", "name": "size", "type": "\"long\"" } ], "type": "table" } }, "output_type": "display_data" } ], "source": [ "display(tips_sdf.filter(\"size is null\"))" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "2b595b34-0633-4f66-9ca0-6067f4cc0716", "showTitle": false, "title": "" } }, "source": [ "
\n", "3. Prepare data
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "453316e6-0dc3-41b0-9730-27c39ed9bdf1", "showTitle": false, "title": "" } }, "source": [ "

Some transformations are applied to prepare the dataset for training an ML model.

\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "
column namecomment
tiptarget to predict. Contains numeric values
total_billnumeric column to keep as is
sexContains Female and Male converted to 1 and 0
smokerContains Yes and No converted to 1 and 0
timeContains Dinner and Lunch converted to 1 and 0
daycategorical column to One Hot Encode
sizecategorical column to One Hot Encode
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "92c6fbbf-0a08-4fee-8ad7-abdf5a0f9ea4", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "# Encode the binary string columns as 0/1 flags and keep the other columns as is.\n", "# The result gets its own name so that re-running this cell always starts from the\n", "# raw tips_sdf; reassigning tips_sdf would make a second run compare the already\n", "# encoded integers with strings and silently turn the flags into zeros.\n", "# smoker is stored as 'Yes'/'No' (capitalised) in the dataset, so compare against 'Yes'.\n", "tips_prepared_sdf = tips_sdf.selectExpr(\n", "    \"total_bill\",\n", "    \"tip\",\n", "    \"case when sex = 'Female' then 1 else 0 end as sex\",\n", "    \"case when smoker = 'Yes' then 1 else 0 end as smoker\",\n", "    \"case when time = 'Dinner' then 1 else 0 end as time\",\n", "    \"day\",\n", "    \"size\")\n", "#\n", "# 80/20 train/test split; the fixed seed keeps the split stable between runs\n", "train_df, test_df = tips_prepared_sdf.randomSplit([.8, .2], seed=42)\n", "#\n", "# columns to one-hot encode, numeric columns passed through unchanged, and the label\n", "ohe_cols = [\"size\", \"day\"]\n", "num_cols = [\"total_bill\", \"sex\", \"smoker\", \"time\"]\n", "target_col = \"tip\"\n", "#\n", "# index the categorical columns; rows with values unseen at fit time are skipped\n", "string_indexer = StringIndexer(inputCols=ohe_cols, outputCols=[c+\"_index\" for c in ohe_cols], handleInvalid=\"skip\")\n", "#\n", "# one-hot encode the indexed columns\n", "ohe = OneHotEncoder()\n", "ohe.setInputCols([c+\"_index\" for c in ohe_cols])\n", "ohe.setOutputCols([c+\"_ohe\" for c in ohe_cols])\n", "#\n", "# assemble the encoded and numeric columns into the single \"features\" vector column\n", "assembler_inputs = [c+\"_ohe\" for c in ohe_cols] + num_cols\n", "vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol=\"features\")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "910af898-da90-4e26-a856-cdb4b902e101", "showTitle": false, "title": "" } }, "source": [ "
\n", "4. Evaluator and model
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "06212c8c-e7bf-45e7-827f-fd3fcad64486", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "gbt = GBTRegressor(featuresCol=\"features\", labelCol=target_col, maxIter=5)\n", "evaluator = RegressionEvaluator(labelCol=target_col, predictionCol=\"prediction\", metricName=\"rmse\")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "cd1fc5a1-c77d-45e4-88b2-d2861900b3e5", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "5. Manually log parameters, models, and evaluation metrics using MLflow
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "c0a1d573-d054-48bb-864a-fb9eab2efaa3", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "# artifact path under which the fitted pipeline is stored inside each run\n", "model_name = \"GBT-Regressor\"\n", "#\n", "# everything logged inside this block is attached to a single MLflow run\n", "with mlflow.start_run(run_name=\"Tip-run\") as run:\n", "  #\n", "  # define pipeline stages according to model\n", "  stages = [string_indexer, ohe, vec_assembler, gbt]\n", "  #\n", "  # set pipeline\n", "  pipeline = Pipeline(stages=stages)\n", "  #\n", "  # fit pipeline to train set\n", "  model = pipeline.fit(train_df)\n", "  #\n", "  # manually log model to mlflow\n", "  mlflow.spark.log_model(model, model_name)\n", "  #\n", "  # manually log parameter to mlflow; read it from the estimator so the logged\n", "  # value cannot drift from the one the model was actually trained with\n", "  mlflow.log_param(\"maxIter\", gbt.getMaxIter())\n", "  #\n", "  # predict test set\n", "  pred_df = model.transform(test_df)\n", "  #\n", "  # evaluate prediction\n", "  rmse = evaluator.evaluate(pred_df)\n", "  #\n", "  # manually log metric to mlflow\n", "  mlflow.log_metric(\"rmse\", rmse)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "b7c1ad7c-c381-4758-bb59-5114ba6f0ba3", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "6. Programmatically access and use data, metadata, and models from MLflow experiments
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "89edcb39-0b90-44ca-b6fd-5af69c3115a3", "showTitle": false, "title": "" } }, "source": [ "

This can be done in different ways. One of them is to access it programmatically with the function mlflow.search_runs, which returns a pandas DataFrame containing all useful information for all runs in the current experiment (by default, the current experiment has the name of the current notebook):

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "e5839d28-4117-400d-9a8c-d7fa5fbd0665", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
run_idexperiment_idstatusartifact_uristart_timeend_timemetrics.rmseparams.maxItertags.mlflow.databricks.cluster.libraries.errortags.mlflow.databricks.notebookRevisionIDtags.mlflow.databricks.workspaceIDtags.mlflow.databricks.notebookIDtags.sparkDatasourceInfotags.mlflow.log-model.historytags.mlflow.databricks.notebook.commandIDtags.mlflow.source.typetags.mlflow.databricks.webappURLtags.mlflow.runNametags.mlflow.databricks.cluster.infotags.mlflow.databricks.cluster.id
03b461072c96e4ce492250c86e5d4b04a4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:47:43.155000+00:002023-11-22 16:48:13.660000+00:001.6857125This message class grpc_shaded.com.databricks....170067169406536075798609407184106912166953874path=mlflowdbfs:/artifacts?run_id=05c39cdd803d...[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_5042614833210616149_6f6f9d...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
105c39cdd803d41aaaf1d95e938bccb3c4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:43:02.360000+00:002023-11-22 16:43:32.223000+00:001.0712175This message class grpc_shaded.com.databricks....170067141255536075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_8003943757913343583_2f0694...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
2197b10148baf4c55a9cfd55cb716a45f4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:42:20.492000+00:002023-11-22 16:42:50.008000+00:001.5685735This message class grpc_shaded.com.databricks....170067137033036075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_8672488673836230069_a44c4b...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
30f53885013194fdab9bc8ad6383e7bdf4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:37:23.545000+00:002023-11-22 16:37:53.810000+00:001.5817765This message class grpc_shaded.com.databricks....170067107413736075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_6064693358274921191_17ebf6...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
415ba91e309d548c2a85c1f01c006e39b4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:35:48.939000+00:002023-11-22 16:36:25.395000+00:001.4985925This message class grpc_shaded.com.databricks....170067098571936075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_8549608798050021702_4078e9...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
\n", "
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "arguments": {}, "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
run_idexperiment_idstatusartifact_uristart_timeend_timemetrics.rmseparams.maxItertags.mlflow.databricks.cluster.libraries.errortags.mlflow.databricks.notebookRevisionIDtags.mlflow.databricks.workspaceIDtags.mlflow.databricks.notebookIDtags.sparkDatasourceInfotags.mlflow.log-model.historytags.mlflow.databricks.notebook.commandIDtags.mlflow.source.typetags.mlflow.databricks.webappURLtags.mlflow.runNametags.mlflow.databricks.cluster.infotags.mlflow.databricks.cluster.id
03b461072c96e4ce492250c86e5d4b04a4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:47:43.155000+00:002023-11-22 16:48:13.660000+00:001.6857125This message class grpc_shaded.com.databricks....170067169406536075798609407184106912166953874path=mlflowdbfs:/artifacts?run_id=05c39cdd803d...[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_5042614833210616149_6f6f9d...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
105c39cdd803d41aaaf1d95e938bccb3c4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:43:02.360000+00:002023-11-22 16:43:32.223000+00:001.0712175This message class grpc_shaded.com.databricks....170067141255536075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_8003943757913343583_2f0694...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
2197b10148baf4c55a9cfd55cb716a45f4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:42:20.492000+00:002023-11-22 16:42:50.008000+00:001.5685735This message class grpc_shaded.com.databricks....170067137033036075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_8672488673836230069_a44c4b...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
30f53885013194fdab9bc8ad6383e7bdf4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:37:23.545000+00:002023-11-22 16:37:53.810000+00:001.5817765This message class grpc_shaded.com.databricks....170067107413736075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_6064693358274921191_17ebf6...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
415ba91e309d548c2a85c1f01c006e39b4106912166953874FINISHEDdbfs:/databricks/mlflow-tracking/4106912166953...2023-11-22 16:35:48.939000+00:002023-11-22 16:36:25.395000+00:001.4985925This message class grpc_shaded.com.databricks....170067098571936075798609407184106912166953874None[{\"artifact_path\":\"GBT-Regressor\",\"flavors\":{\"...4723686315041778941_8549608798050021702_4078e9...NOTEBOOKhttps://eastus-c3.azuredatabricks.netTip-run{\"cluster_name\":\"Victor Bonnet's Cluster\",\"spa...1027-081006-5cgi5kuh
\n
", "datasetInfos": [], "metadata": {}, "removedWidgets": [], "textData": null, "type": "htmlSandbox" } }, "output_type": "display_data" } ], "source": [ "mlflow.search_runs().drop(['tags.mlflow.databricks.workspaceURL',\n", " 'tags.mlflow.databricks.notebookPath',\n", " 'tags.mlflow.source.name',\n", " 'tags.mlflow.user'], axis=1)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "f04a8cf6-a501-4e11-a7af-66b9b9bd6744", "showTitle": false, "title": "" } }, "source": [ "

Using pandas syntax, the result can be narrowed down to the columns that are needed and sorted:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "29d945c5-a93c-4f84-a01b-341d71e9f980", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tags.mlflow.runNamerun_idparams.maxItermetrics.rmse
1Tip-run05c39cdd803d41aaaf1d95e938bccb3c51.071217
4Tip-run15ba91e309d548c2a85c1f01c006e39b51.498592
2Tip-run197b10148baf4c55a9cfd55cb716a45f51.568573
3Tip-run0f53885013194fdab9bc8ad6383e7bdf51.581776
0Tip-run3b461072c96e4ce492250c86e5d4b04a51.685712
\n", "
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "arguments": {}, "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
tags.mlflow.runNamerun_idparams.maxItermetrics.rmse
1Tip-run05c39cdd803d41aaaf1d95e938bccb3c51.071217
4Tip-run15ba91e309d548c2a85c1f01c006e39b51.498592
2Tip-run197b10148baf4c55a9cfd55cb716a45f51.568573
3Tip-run0f53885013194fdab9bc8ad6383e7bdf51.581776
0Tip-run3b461072c96e4ce492250c86e5d4b04a51.685712
\n
", "datasetInfos": [], "metadata": {}, "removedWidgets": [], "textData": null, "type": "htmlSandbox" } }, "output_type": "display_data" } ], "source": [ "mlflow.search_runs()[[\"tags.mlflow.runName\", \"run_id\", \"params.maxIter\", \"metrics.rmse\"]].sort_values(by=['metrics.rmse'], ascending=True)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "3ea640ea-2ab2-46f6-b53f-a440ef888681", "showTitle": false, "title": "" } }, "source": [ "

A SQL-like filter can also be applied directly in the mlflow.search_runs() function by using its filter_string parameter. This is particularly useful when there are many runs:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9844f927-34d9-4ffc-a1f7-c8c17bafc6bb", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tags.mlflow.runNamerun_idparams.maxItermetrics.rmse
0Tip-run05c39cdd803d41aaaf1d95e938bccb3c51.071217
1Tip-run15ba91e309d548c2a85c1f01c006e39b51.498592
\n", "
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "arguments": {}, "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
tags.mlflow.runNamerun_idparams.maxItermetrics.rmse
0Tip-run05c39cdd803d41aaaf1d95e938bccb3c51.071217
1Tip-run15ba91e309d548c2a85c1f01c006e39b51.498592
\n
", "datasetInfos": [], "metadata": {}, "removedWidgets": [], "textData": null, "type": "htmlSandbox" } }, "output_type": "display_data" } ], "source": [ "mlflow.search_runs(filter_string=\"tags.mlflow.runName like '%Tip%' and metrics.rmse<=1.5\")[[\"tags.mlflow.runName\", \"run_id\", \"params.maxIter\", \"metrics.rmse\"]]" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "de6aca3d-c7f8-4d16-881b-df28551dc63e", "showTitle": false, "title": "" } }, "source": [ "

With this, let's load the best model:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "d37c9d5a-6eb0-4eee-891f-d547ff1b08b8", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best model path is: runs:/05c39cdd803d41aaaf1d95e938bccb3c/GBT-Regressor\n" ] } ], "source": [ "# Ask the tracking server for the single finished run with the lowest RMSE instead of\n", "# downloading every run and sorting client-side. The metrics.rmse >= 0 condition keeps\n", "# only runs that actually logged the metric (an RMSE is never negative).\n", "best_runs = mlflow.search_runs(filter_string=\"attributes.status = 'FINISHED' and metrics.rmse >= 0\",\n", "                               order_by=[\"metrics.rmse ASC\"],\n", "                               max_results=1)\n", "#\n", "# fail with a clear message rather than an IndexError when nothing matches\n", "if best_runs.empty:\n", "  raise RuntimeError(\"No finished MLflow run with an rmse metric was found in the current experiment\")\n", "#\n", "bestModelRunId = best_runs[\"run_id\"].iloc[0]\n", "#\n", "# runs:/<run_id>/<artifact_path> URI of the pipeline logged by that run\n", "best_model_path = f\"runs:/{bestModelRunId}/{model_name}\"\n", "print(f\"Best model path is: {best_model_path}\")\n", "#\n", "loaded_model = mlflow.spark.load_model(best_model_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "75cd5545-72e8-4678-b590-48a52a6e19c7", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
tipprediction
1.322.2600362250689874
1.562.2600362250689874
1.572.270591780624543
2.02.4706720284816104
1.012.621464796497559
3.03.2463005089207058
2.53.1992041196791403
3.312.9060591599135535
3.124.0487121332066645
7.584.6646880493827165
1.01.3015395831199872
1.81.9777634977962606
2.01.9777634977962606
1.682.270591780624543
2.092.564973300067474
2.472.6026518714960454
4.063.227487583919192
4.04.361694589718562
4.734.687208975308642
2.56.218021382716049
1.51.5428140028467654
1.611.5533466844466506
2.01.5533466844466506
3.553.456976
2.01.5428140028467654
2.21.9777634977962606
2.011.9777634977962606
2.231.9882961793961458
2.01.9777634977962606
1.582.270591780624543
3.482.270591780624543
2.02.270591780624543
2.022.270591780624543
3.02.564973300067474
1.52.270591780624543
2.53.192036225068987
4.193.131679615987459
5.05.054009020863314
5.164.408080999391555
6.55.169339707135286
2.03.0488064
2.03.427246104519053
1.171.6696819697747305
5.04.990542308641975
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "aggData": [], "aggError": "", "aggOverflow": false, "aggSchema": [], "aggSeriesLimitReached": false, "aggType": "", "arguments": {}, "columnCustomDisplayInfos": {}, "data": [ [ 1.32, 2.2600362250689874 ], [ 1.56, 2.2600362250689874 ], [ 1.57, 2.270591780624543 ], [ 2, 2.4706720284816104 ], [ 1.01, 2.621464796497559 ], [ 3, 3.2463005089207058 ], [ 2.5, 3.1992041196791403 ], [ 3.31, 2.9060591599135535 ], [ 3.12, 4.0487121332066645 ], [ 7.58, 4.6646880493827165 ], [ 1, 1.3015395831199872 ], [ 1.8, 1.9777634977962606 ], [ 2, 1.9777634977962606 ], [ 1.68, 2.270591780624543 ], [ 2.09, 2.564973300067474 ], [ 2.47, 2.6026518714960454 ], [ 4.06, 3.227487583919192 ], [ 4, 4.361694589718562 ], [ 4.73, 4.687208975308642 ], [ 2.5, 6.218021382716049 ], [ 1.5, 1.5428140028467654 ], [ 1.61, 1.5533466844466506 ], [ 2, 1.5533466844466506 ], [ 3.55, 3.456976 ], [ 2, 1.5428140028467654 ], [ 2.2, 1.9777634977962606 ], [ 2.01, 1.9777634977962606 ], [ 2.23, 1.9882961793961458 ], [ 2, 1.9777634977962606 ], [ 1.58, 2.270591780624543 ], [ 3.48, 2.270591780624543 ], [ 2, 2.270591780624543 ], [ 2.02, 2.270591780624543 ], [ 3, 2.564973300067474 ], [ 1.5, 2.270591780624543 ], [ 2.5, 3.192036225068987 ], [ 4.19, 3.131679615987459 ], [ 5, 5.054009020863314 ], [ 5.16, 4.408080999391555 ], [ 6.5, 5.169339707135286 ], [ 2, 3.0488064 ], [ 2, 3.427246104519053 ], [ 1.17, 1.6696819697747305 ], [ 5, 4.990542308641975 ] ], "datasetInfos": [], "dbfsResultPath": null, "isJsonSchema": true, "metadata": {}, "overflow": false, "plotOptions": { "customPlotOptions": {}, "displayType": "table", "pivotAggregation": null, "pivotColumns": null, "xColumns": null, "yColumns": null }, "removedWidgets": [], "schema": [ { "metadata": "{}", "name": "tip", "type": "\"double\"" }, { "metadata": 
"{\"ml_attr\":{\"attrs\":{\"numeric\":[{\"idx\":8,\"name\":\"total_bill\"},{\"idx\":9,\"name\":\"sex\"},{\"idx\":10,\"name\":\"smoker\"},{\"idx\":11,\"name\":\"time\"}],\"binary\":[{\"idx\":0,\"name\":\"size_ohe_2\"},{\"idx\":1,\"name\":\"size_ohe_3\"},{\"idx\":2,\"name\":\"size_ohe_4\"},{\"idx\":3,\"name\":\"size_ohe_5\"},{\"idx\":4,\"name\":\"size_ohe_6\"},{\"idx\":5,\"name\":\"day_ohe_Sat\"},{\"idx\":6,\"name\":\"day_ohe_Sun\"},{\"idx\":7,\"name\":\"day_ohe_Thur\"}]},\"num_attrs\":12}}", "name": "prediction", "type": "\"double\"" } ], "type": "table" } }, "output_type": "display_data" } ], "source": [ "display(loaded_model.transform(test_df).select(\"tip\", \"prediction\"))" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "67f74bab-3bc1-46ec-a68e-de198cc7b1c7", "showTitle": false, "title": "" } }, "source": [ "" ] } ], "metadata": { "application/vnd.databricks.v1+notebook": { "dashboards": [], "language": "python", "notebookMetadata": { "mostRecentlyExecutedCommandWithImplicitDF": { "commandId": 121806328486209, "dataframes": [ "_sqldf" ] }, "pythonIndentUnit": 2 }, "notebookName": "Databricks-ML-professional-S01b-Experiment-Tracking", "widgets": {} }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 }