{ "cells": [ { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "58fab4bb-231e-48cf-8ed4-fc15a1b22845", "showTitle": false, "title": "" } }, "source": [ "

Databricks-ML-professional-S01c-Advanced-Experiment-Tracking

" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "8710b4bb-7fe6-484e-86a9-ffd51a7d4d7a", "showTitle": false, "title": "" } }, "source": [ "
\n", "
\n", "

This Notebook adds information related to the following requirements:


\n", "Advanced Experiment Tracking:\n", "\n", "
\n", "

Download this notebook in ipynb format here.

\n", "
\n", "
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b5f6d0da-1d81-4fa0-9770-a9e4d6863534", "showTitle": false, "title": "" } }, "source": [ "
\n", "1. Import libraries
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8a2d2e59-7426-4d5f-8d97-3dcff6e5151d", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "application/vnd.databricks.v1+bamboolib_hint": "{\"pd.DataFrames\": [], \"version\": \"0.0.1\"}", "text/plain": [] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "#\n", "from pyspark.sql.functions import *\n", "#\n", "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", "from pyspark.ml.regression import GeneralizedLinearRegression, FMRegressor, LinearRegression\n", "from pyspark.ml.evaluation import RegressionEvaluator\n", "from pyspark.ml import Pipeline\n", "#\n", "import mlflow\n", "#\n", "from hyperopt import hp, fmin, tpe, Trials, STATUS_OK\n", "#\n", "import logging" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "85c05c1b-015d-405a-b6be-f8484a985d96", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "logging.getLogger(\"mlflow\").setLevel(logging.FATAL)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "aa08db2c-a856-4c86-81fe-9a8b7322cd6a", "showTitle": false, "title": "" } }, "source": [ "
\n", "2. Load dataset, convert to Spark DataFrame
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5b64ff08-1603-4d0c-bc4e-19c0094c3b9c", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
total_billtipsexsmokerdaytimesize
16.991.01FemaleNoSunDinner2
10.341.66MaleNoSunDinner3
21.013.5MaleNoSunDinner3
23.683.31MaleNoSunDinner2
24.593.61FemaleNoSunDinner4
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "aggData": [], "aggError": "", "aggOverflow": false, "aggSchema": [], "aggSeriesLimitReached": false, "aggType": "", "arguments": {}, "columnCustomDisplayInfos": {}, "data": [ [ 16.99, 1.01, "Female", "No", "Sun", "Dinner", 2 ], [ 10.34, 1.66, "Male", "No", "Sun", "Dinner", 3 ], [ 21.01, 3.5, "Male", "No", "Sun", "Dinner", 3 ], [ 23.68, 3.31, "Male", "No", "Sun", "Dinner", 2 ], [ 24.59, 3.61, "Female", "No", "Sun", "Dinner", 4 ] ], "datasetInfos": [], "dbfsResultPath": null, "isJsonSchema": true, "metadata": {}, "overflow": false, "plotOptions": { "customPlotOptions": {}, "displayType": "table", "pivotAggregation": null, "pivotColumns": null, "xColumns": null, "yColumns": null }, "removedWidgets": [], "schema": [ { "metadata": "{}", "name": "total_bill", "type": "\"double\"" }, { "metadata": "{}", "name": "tip", "type": "\"double\"" }, { "metadata": "{}", "name": "sex", "type": "\"string\"" }, { "metadata": "{}", "name": "smoker", "type": "\"string\"" }, { "metadata": "{}", "name": "day", "type": "\"string\"" }, { "metadata": "{}", "name": "time", "type": "\"string\"" }, { "metadata": "{}", "name": "size", "type": "\"long\"" } ], "type": "table" } }, "output_type": "display_data" } ], "source": [ "tips_df = sns.load_dataset(\"tips\")\n", "#\n", "tips_sdf = spark.createDataFrame(tips_df)\n", "#\n", "display(tips_sdf.limit(5))" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "2b595b34-0633-4f66-9ca0-6067f4cc0716", "showTitle": false, "title": "" } }, "source": [ "
\n", "3. Prepare data
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "453316e6-0dc3-41b0-9730-27c39ed9bdf1", "showTitle": false, "title": "" } }, "source": [ "

Some transformations are applied to prepare the dataset for training an ML model.

\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "
column namecomment
tiptarget to predict. Contains numeric
total_billnumeric column to keep as is
sexContains Female and Male, converted to 1 and 0 respectively
smokerContains Yes and No, converted to 1 and 0 respectively
timeContains Dinner and Lunch, converted to 1 and 0 respectively
daycategorical column to One Hot Encode
sizecategorical column to One Hot Encode
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "92c6fbbf-0a08-4fee-8ad7-abdf5a0f9ea4", "showTitle": false, "title": "" } }, "outputs": [], "source": [
  "# Encode the binary columns as 0/1 and keep the other columns unchanged.\n",
  "# The frame is rebuilt from the pandas source (tips_df) so that re-running this cell\n",
  "# never re-encodes columns that were already converted to 0/1.\n",
  "tips_sdf = spark.createDataFrame(tips_df).selectExpr(\n",
  "    \"total_bill\",\n",
  "    \"tip\",\n",
  "    \"case when sex = 'Female' then 1 else 0 end as sex\",\n",
  "    # raw smoker values are capitalised (the preview above shows 'No'), so match 'Yes'\n",
  "    \"case when smoker = 'Yes' then 1 else 0 end as smoker\",\n",
  "    \"case when time = 'Dinner' then 1 else 0 end as time\",\n",
  "    \"day\",\n",
  "    \"size\")\n",
  "#\n",
  "# 80/20 train/test split with a fixed seed\n",
  "train_df, test_df = tips_sdf.randomSplit([.8, .2], seed=42)\n",
  "#\n",
  "ohe_cols = [\"size\", \"day\"]\n",
  "num_cols = [\"total_bill\", \"sex\", \"smoker\", \"time\"]\n",
  "target_col = \"tip\"\n",
  "#\n",
  "# index the categorical columns (one <col>_index output per input column)\n",
  "string_indexer = StringIndexer(inputCols=ohe_cols, outputCols=[c+\"_index\" for c in ohe_cols], handleInvalid=\"skip\")\n",
  "#\n",
  "# one-hot encode the indexed columns (one <col>_ohe output per input column)\n",
  "ohe = OneHotEncoder()\n",
  "ohe.setInputCols([c+\"_index\" for c in ohe_cols])\n",
  "ohe.setOutputCols([c+\"_ohe\" for c in ohe_cols])\n",
  "#\n",
  "# assemble the encoded and numeric columns into a single \"features\" vector\n",
  "assembler_inputs = [c+\"_ohe\" for c in ohe_cols] + num_cols\n",
  "vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol=\"features\")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "910af898-da90-4e26-a856-cdb4b902e101", "showTitle": false, "title": "" } }, "source": [ "
\n", "4. Evaluator and model
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "06212c8c-e7bf-45e7-827f-fd3fcad64486", "showTitle": false, "title": "" } }, "outputs": [], "source": [
  "# Candidate regressors: each one reads the assembled \"features\" vector and predicts target_col\n",
  "glr = GeneralizedLinearRegression(\n",
  "    featuresCol=\"features\",\n",
  "    labelCol=target_col,\n",
  "    maxIter=10)\n",
  "lrm = LinearRegression(\n",
  "    featuresCol=\"features\",\n",
  "    labelCol=target_col)\n",
  "fmr = FMRegressor(\n",
  "    featuresCol=\"features\",\n",
  "    labelCol=target_col,\n",
  "    stepSize=0.001)\n",
  "# Shared evaluator: rmse between the \"prediction\" column and target_col\n",
  "evaluator = RegressionEvaluator(\n",
  "    labelCol=target_col,\n",
  "    predictionCol=\"prediction\",\n",
  "    metricName=\"rmse\")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "cd1fc5a1-c77d-45e4-88b2-d2861900b3e5", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "5. Perform MLflow experiment tracking workflows using model signatures and input examples
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "6af34c76-f8fd-40a6-a62a-7dd1a94e88de", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "inputs: \n", " ['total_bill': double, 'tip': double, 'sex': integer, 'smoker': integer, 'time': integer, 'day': string, 'size': long]\n", "outputs: \n", " ['tip': double]\n", "\n" ] } ], "source": [
  "# Infer the model signature from the training data.\n",
  "# The label column is excluded from the inputs: a signature that lists the target as an\n",
  "# input would require callers to provide the value they are trying to predict.\n",
  "signature = mlflow.models.infer_signature(train_df.drop(target_col), train_df.select(target_col))\n",
  "print(signature)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "40df8662-ab45-49a0-8069-8943078270eb", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_billtipsexsmokertimedaysize
08.772.00001Sun2
19.551.45001Sat2
29.941.56001Sun2
310.271.71001Sun2
410.292.60101Sun2
\n", "
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "arguments": {}, "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
total_billtipsexsmokertimedaysize
08.772.00001Sun2
19.551.45001Sat2
29.941.56001Sun2
310.271.71001Sun2
410.292.60101Sun2
\n
", "datasetInfos": [], "metadata": {}, "removedWidgets": [], "textData": null, "type": "htmlSandbox" } }, "output_type": "display_data" } ], "source": [
  "input_example = train_df.toPandas().head()\n",
  "input_example" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "c0a1d573-d054-48bb-864a-fb9eab2efaa3", "showTitle": false, "title": "" } }, "outputs": [], "source": [
  "# Single tracked run: fit the linear regression pipeline, then manually log the model,\n",
  "# one parameter and the test rmse.\n",
  "model_name = \"linear-regression\"\n",
  "#\n",
  "with mlflow.start_run(run_name=\"Tip-run\") as run:\n",
  "    #\n",
  "    # define pipeline stages according to model\n",
  "    stages = [string_indexer, ohe, vec_assembler, lrm]\n",
  "    #\n",
  "    # set pipeline\n",
  "    pipeline = Pipeline(stages=stages)\n",
  "    #\n",
  "    # fit pipeline to train set\n",
  "    model = pipeline.fit(train_df)\n",
  "    #\n",
  "    # manually log model to mlflow\n",
  "    mlflow.spark.log_model(model, model_name, signature=signature, input_example=input_example)\n",
  "    #\n",
  "    # manually log parameter to mlflow: read the value from the estimator that was\n",
  "    # actually fitted instead of logging a hard-coded number unrelated to it\n",
  "    mlflow.log_param(\"maxIter\", lrm.getMaxIter())\n",
  "    #\n",
  "    # predict test set\n",
  "    pred_df = model.transform(test_df)\n",
  "    #\n",
  "    # evaluate prediction\n",
  "    rmse = evaluator.evaluate(pred_df)\n",
  "    #\n",
  "    # manually log metric to mlflow\n",
  "    mlflow.log_metric(\"rmse\", rmse)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b7c1ad7c-c381-4758-bb59-5114ba6f0ba3", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "6. Identify the requirements for tracking nested runs
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "89edcb39-0b90-44ca-b6fd-5af69c3115a3", "showTitle": false, "title": "" } }, "source": [ "

It is possible to log to mlflow using nested runs:

\n", "" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "e5839d28-4117-400d-9a8c-d7fa5fbd0665", "showTitle": false, "title": "" } }, "outputs": [], "source": [
  "# Parent run that groups one nested child run per regression model\n",
  "with mlflow.start_run(run_name=\"tips_evaluation\") as run_parent:\n",
  "    #\n",
  "    # iterate over the three candidate regressors\n",
  "    for regression_model in (glr, lrm, fmr):\n",
  "        #\n",
  "        # run / artifact name: leading part of the estimator's string form, up to the first \"_\"\n",
  "        model_name = str(regression_model).split(\"_\")[0]\n",
  "        #\n",
  "        # child run, attached to the parent through nested=True\n",
  "        with mlflow.start_run(run_name=model_name, nested=True) as run:\n",
  "            #\n",
  "            # same preprocessing stages for every model, regressor as last stage\n",
  "            stages = [string_indexer, ohe, vec_assembler, regression_model]\n",
  "            #\n",
  "            # build the pipeline\n",
  "            pipeline = Pipeline(stages=stages)\n",
  "            #\n",
  "            # train on the train set\n",
  "            model = pipeline.fit(train_df)\n",
  "            #\n",
  "            # store the fitted pipeline with its signature and input example\n",
  "            mlflow.spark.log_model(model, model_name, signature=signature, input_example=input_example)\n",
  "            #\n",
  "            # score the test set\n",
  "            pred_df = model.transform(test_df)\n",
  "            #\n",
  "            # compute the evaluation metric\n",
  "            rmse = evaluator.evaluate(pred_df)\n",
  "            #\n",
  "            # record the metric on the child run\n",
  "            mlflow.log_metric(\"rmse\", rmse)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "f04a8cf6-a501-4e11-a7af-66b9b9bd6744", "showTitle": false, "title": "" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "29d945c5-a93c-4f84-a01b-341d71e9f980", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "7. Describe the process of enabling autologging, including with the use of Hyperopt
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "3ea640ea-2ab2-46f6-b53f-a440ef888681", "showTitle": false, "title": "" } }, "source": [ "
Here we enable mlflow logging with autolog() and train a simple model. This will automatically log everything possible for each library used.
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "3b75ef74-0a1a-4740-8dc1-567388562b72", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "mlflow.autolog()" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "c28cef57-eb3d-40d6-91f0-24b5ea00505f", "showTitle": false, "title": "" } }, "source": [ "

Now let's fit and evaluate a model:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "2e9b4b6e-18be-4c01-ac94-ddfb7263b97b", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Out[12]: 1.3054572176798678" ] } ], "source": [
  "# Build the linear regression pipeline explicitly. Reusing the global `pipeline` here would\n",
  "# silently pick up whatever the nested-run loop built last (the FMRegressor pipeline),\n",
  "# which is not the model this cell is named after.\n",
  "pipeline_lrm_autolog = Pipeline(stages=[string_indexer, ohe, vec_assembler, lrm])\n",
  "#\n",
  "# fit pipeline to train set (autologging is enabled for this fit)\n",
  "model_lrm_autolog = pipeline_lrm_autolog.fit(train_df)\n",
  "#\n",
  "# predict test set\n",
  "pred_df = model_lrm_autolog.transform(test_df)\n",
  "#\n",
  "# evaluate (rmse, displayed as the cell output)\n",
  "evaluator.evaluate(pred_df)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "431483d6-48ce-4392-87c2-95dabcfd87c8", "showTitle": false, "title": "" } }, "source": [ "

After that, in MLflow UI, we can see the many parameters that have been logged.

\n", "

Alternatively, we can retrieve and display the logged parameters for the latest run programmatically:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "eb984d77-72af-4a8b-8ae7-14188488963a", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
run_idexperiment_idstatusartifact_uristart_timeend_timemetrics.rmse_test_dfmetrics.rmsemetrics.rmse_unknown_datasetparams.FMRegressor.fitInterceptparams.FMRegressor.maxIterparams.OneHotEncoder.outputColparams.OneHotEncoder.outputColsparams.stagesparams.StringIndexer.stringOrderTypeparams.StringIndexer.inputColsparams.FMRegressor.tolparams.StringIndexer.outputColsparams.StringIndexer.handleInvalidparams.FMRegressor.solverparams.FMRegressor.factorSizeparams.OneHotEncoder.handleInvalidparams.OneHotEncoder.inputColsparams.StringIndexer.outputColparams.FMRegressor.fitLinearparams.FMRegressor.miniBatchFractionparams.VectorAssembler.handleInvalidparams.VectorAssembler.inputColsparams.FMRegressor.predictionColparams.FMRegressor.regParamparams.FMRegressor.labelColparams.FMRegressor.featuresColparams.FMRegressor.initStdparams.FMRegressor.stepSizeparams.OneHotEncoder.dropLastparams.FMRegressor.seedparams.VectorAssembler.outputColparams.maxIterparams.LinearRegression.maxIterparams.LinearRegression.standardizationparams.LinearRegression.tolparams.LinearRegression.solverparams.LinearRegression.elasticNetParamparams.LinearRegression.maxBlockSizeInMBparams.LinearRegression.featuresColparams.LinearRegression.labelColparams.LinearRegression.fitInterceptparams.LinearRegression.aggregationDepthparams.LinearRegression.lossparams.LinearRegression.predictionColparams.LinearRegression.epsilonparams.LinearRegression.regParamtags.mlflow.databricks.cluster.idtags.mlflow.databricks.cluster.libraries.errortags.mlflow.databricks.notebookRevisionIDtags.mlflow.databricks.workspaceIDtags.mlflow.databricks.notebook.commandIDtags.mlflow.source.typetags.mlflow.databricks.webappURLtags.mlflow.runNametags.estimator_classtags.mlflow.autologgingtags.mlflow.databricks.notebookIDtags.estimator_nametags.mlflow.parentRunIdtags.mlflow.rootRunIdtags.mlflow.log-model.history
08b972964438470595f2ba9ba6aa9d403541968995997190FINISHEDdbfs:/databricks/mlflow-tracking/3541968995997190/08b972964438470595f2ba9ba6aa9d40/artifacts2023-11-22T16:55:45.974+00002023-11-22T16:55:56.280+00001.3054572176798678nullnullTrue100OneHotEncoder_c95dbc53b9cc__output['size_ohe', 'day_ohe']['StringIndexer', 'OneHotEncoder', 'VectorAssembler', 'FMRegressor']frequencyDesc['size', 'day']1e-06['size_index', 'day_index']skipadamW8error['size_index', 'day_index']StringIndexer_c3caebc64717__outputTrue1.0error['size_ohe', 'day_ohe', 'total_bill', 'sex', 'smoker', 'time']prediction0.0tipfeatures0.010.001True-2921654334123211668featuresnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull1027-081006-5cgi5kuhThis message class grpc_shaded.com.databricks.api.proto.managedLibraries.ClusterStatus DID NOT match any methods in the stub class grpc_shaded.com.databricks.api.proto.cluster.ClusterServiceGrpc$ClusterServiceBlockingStub170067215661136075798609407187308506017976005609_4719503818729863905_49d8e6f9a405484da1266fc91cafd976NOTEBOOKhttps://eastus-c3.azuredatabricks.netcolorful-snake-723pyspark.ml.pipeline.Pipelinepyspark.ml3541968995997190Pipelinenullnullnull
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "aggData": [], "aggError": "", "aggOverflow": false, "aggSchema": [], "aggSeriesLimitReached": false, "aggType": "", "arguments": {}, "columnCustomDisplayInfos": {}, "data": [ [ "08b972964438470595f2ba9ba6aa9d40", "3541968995997190", "FINISHED", "dbfs:/databricks/mlflow-tracking/3541968995997190/08b972964438470595f2ba9ba6aa9d40/artifacts", "2023-11-22T16:55:45.974+0000", "2023-11-22T16:55:56.280+0000", 1.3054572176798678, null, null, "True", "100", "OneHotEncoder_c95dbc53b9cc__output", "['size_ohe', 'day_ohe']", "['StringIndexer', 'OneHotEncoder', 'VectorAssembler', 'FMRegressor']", "frequencyDesc", "['size', 'day']", "1e-06", "['size_index', 'day_index']", "skip", "adamW", "8", "error", "['size_index', 'day_index']", "StringIndexer_c3caebc64717__output", "True", "1.0", "error", "['size_ohe', 'day_ohe', 'total_bill', 'sex', 'smoker', 'time']", "prediction", "0.0", "tip", "features", "0.01", "0.001", "True", "-2921654334123211668", "features", null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, "1027-081006-5cgi5kuh", "This message class grpc_shaded.com.databricks.api.proto.managedLibraries.ClusterStatus DID NOT match any methods in the stub class grpc_shaded.com.databricks.api.proto.cluster.ClusterServiceGrpc$ClusterServiceBlockingStub", "1700672156611", "3607579860940718", "7308506017976005609_4719503818729863905_49d8e6f9a405484da1266fc91cafd976", "NOTEBOOK", "https://eastus-c3.azuredatabricks.net", "colorful-snake-723", "pyspark.ml.pipeline.Pipeline", "pyspark.ml", "3541968995997190", "Pipeline", null, null, null ] ], "datasetInfos": [], "dbfsResultPath": null, "isJsonSchema": true, "metadata": {}, "overflow": false, "plotOptions": { "customPlotOptions": {}, "displayType": "table", "pivotAggregation": null, "pivotColumns": null, "xColumns": null, "yColumns": null }, "removedWidgets": [], "schema": [ { "metadata": "{}", "name": "run_id", "type": 
"\"string\"" }, { "metadata": "{}", "name": "experiment_id", "type": "\"string\"" }, { "metadata": "{}", "name": "status", "type": "\"string\"" }, { "metadata": "{}", "name": "artifact_uri", "type": "\"string\"" }, { "metadata": "{}", "name": "start_time", "type": "\"timestamp\"" }, { "metadata": "{}", "name": "end_time", "type": "\"timestamp\"" }, { "metadata": "{}", "name": "metrics.rmse_test_df", "type": "\"double\"" }, { "metadata": "{}", "name": "metrics.rmse", "type": "\"double\"" }, { "metadata": "{}", "name": "metrics.rmse_unknown_dataset", "type": "\"double\"" }, { "metadata": "{}", "name": "params.FMRegressor.fitIntercept", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.maxIter", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.outputCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.outputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.stages", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.stringOrderType", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.inputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.tol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.outputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.handleInvalid", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.solver", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.factorSize", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.handleInvalid", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.inputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.outputCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.fitLinear", "type": "\"string\"" }, { "metadata": "{}", "name": 
"params.FMRegressor.miniBatchFraction", "type": "\"string\"" }, { "metadata": "{}", "name": "params.VectorAssembler.handleInvalid", "type": "\"string\"" }, { "metadata": "{}", "name": "params.VectorAssembler.inputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.predictionCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.regParam", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.labelCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.featuresCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.initStd", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.stepSize", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.dropLast", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.seed", "type": "\"string\"" }, { "metadata": "{}", "name": "params.VectorAssembler.outputCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.maxIter", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.maxIter", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.standardization", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.tol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.solver", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.elasticNetParam", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.maxBlockSizeInMB", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.featuresCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.labelCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.fitIntercept", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.aggregationDepth", "type": "\"string\"" }, { "metadata": "{}", 
"name": "params.LinearRegression.loss", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.predictionCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.epsilon", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.regParam", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.cluster.id", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.cluster.libraries.error", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.notebookRevisionID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.workspaceID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.notebook.commandID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.source.type", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.webappURL", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.runName", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.estimator_class", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.autologging", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.notebookID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.estimator_name", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.parentRunId", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.rootRunId", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.log-model.history", "type": "\"string\"" } ], "type": "table" } }, "output_type": "display_data" } ], "source": [ "output_mlflow = (spark.createDataFrame(mlflow.search_runs())\n", " .drop(*['tags.mlflow.source.name',\n", " 'tags.mlflow.databricks.notebookPath',\n", " 'tags.mlflow.user',\n", " 'tags.mlflow.databricks.workspaceURL',\n", " 'tags.mlflow.databricks.cluster.info']))\n", 
"display(output_mlflow.orderBy(desc(\"end_time\")).limit(1))" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "54efcfb9-459f-4682-af3a-71d7b9f188bb", "showTitle": false, "title": "" } }, "source": [ "
Here, let's use HyperOpt for hyperparameter tuning and rely on autolog() to log everything.
\n", "

HyperOpt:

\n", "" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "720a634b-bc15-41e4-ad84-93ff25a01bdf", "showTitle": false, "title": "" } }, "outputs": [], "source": [
  "def train_model(maxIter, regParam, elasticNetParam, labelCol):\n",
  "    \"\"\"\n",
  "    Train and evaluate one LinearRegression pipeline inside a nested MLflow run.\n",
  "\n",
  "    Parameters:\n",
  "    - maxIter: maximum number of iterations (cast to int, hyperopt's quniform samples floats)\n",
  "    - regParam: regularization parameter passed to LinearRegression\n",
  "    - elasticNetParam: elastic-net mixing parameter passed to LinearRegression\n",
  "    - labelCol: name of the column to predict\n",
  "\n",
  "    Returns a tuple (fitted pipeline model, rmse score on the test dataset).\n",
  "    \"\"\"\n",
  "    # Use MLflow to track training.\n",
  "    # Specify \"nested=True\" since this single model will be logged as a child run of Hyperopt's run.\n",
  "    with mlflow.start_run(nested=True):\n",
  "        #\n",
  "        # use the labelCol argument rather than the global target_col so the parameter is honoured\n",
  "        model_hyperopt = LinearRegression(maxIter=int(maxIter),\n",
  "                                          regParam=regParam,\n",
  "                                          elasticNetParam=elasticNetParam,\n",
  "                                          labelCol=labelCol)\n",
  "        #\n",
  "        # metric stated explicitly, consistent with the shared evaluator defined earlier\n",
  "        evaluator_hyperopt = RegressionEvaluator(labelCol=labelCol, predictionCol=\"prediction\", metricName=\"rmse\")\n",
  "        #\n",
  "        stages = [string_indexer, ohe, vec_assembler, model_hyperopt]\n",
  "        #\n",
  "        # set pipeline\n",
  "        pipeline = Pipeline(stages=stages)\n",
  "        #\n",
  "        # fit pipeline to train set\n",
  "        model_lr_hyperopt = pipeline.fit(train_df)\n",
  "        #\n",
  "        # predict test set\n",
  "        pred_df = model_lr_hyperopt.transform(test_df)\n",
  "        #\n",
  "        # evaluate\n",
  "        rmse = evaluator_hyperopt.evaluate(pred_df)\n",
  "        #\n",
  "        # log rmse for each child run\n",
  "        mlflow.log_metric(\"rmse\", rmse)\n",
  "    #\n",
  "    return model_lr_hyperopt, rmse" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9c6604cb-3318-4341-93ba-465ea966a13c", "showTitle": false, "title": "" } }, "outputs": [], "source": [
  "def objective(params):\n",
  "    \"\"\"\n",
  "    Objective function minimized by hyperopt.\n",
  "\n",
  "    Trains a model with the sampled hyperparameters in params (keys maxIter, regParam,\n",
  "    elasticNetParam) and returns its test rmse as the loss, with status STATUS_OK.\n",
  "    \"\"\"\n",
  "    #\n",
  "    # only the score is needed here, the fitted model is discarded\n",
  "    _, rmse = train_model(maxIter=params[\"maxIter\"],\n",
  "                          regParam=params[\"regParam\"],\n",
  "                          elasticNetParam=params[\"elasticNetParam\"],\n",
  "                          labelCol=target_col)\n",
  "    #\n",
  "    return {'loss': rmse, 'status': STATUS_OK}" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "116f2066-932c-4268-98e0-2fc9ff77e6dd", "showTitle": false, "title": "" } }, "source": [ "

Let's define the hyperparameter search spaces:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8edfe198-1d5f-46fc-af5e-e9cd10bcc14f", "showTitle": false, "title": "" } }, "outputs": [], "source": [
  "# One hyperopt distribution per tuned hyperparameter; the keys match the names read by objective()\n",
  "search_spaces = {\n",
  "    # values between 1 and 100, quantized with step 1\n",
  "    \"maxIter\": hp.quniform(\"maxIter\", 1, 100, 1),\n",
  "    # uniform between 0.1 and 10\n",
  "    \"regParam\": hp.uniform(\"regParam\", 0.1, 10),\n",
  "    # uniform between 0 and 1\n",
  "    \"elasticNetParam\": hp.uniform(\"elasticNetParam\", 0, 1),\n",
  "}" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "e1a9966d-c2da-46bb-a251-06213916ec80", "showTitle": false, "title": "" } }, "source": [ "

Finally let's run the hyperparameter tuning with HyperOpt:

\n", "

As we are using a model from MLlib, we are going to use the Trials class as the value of the trials parameter of the fmin function.

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "1a5c898f-cb76-4e8b-ab32-8695efd97c23", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", " 0%| | 0/15 [00:00See also this page or this video to learn more on HyperOpt.

" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "3d035eb8-cc74-47d6-aa7d-469b39fcb013", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "8. Log and view artifacts like SHAP plots, custom visualizations, feature data, images, and metadata
\n", "

It looks like logging SHAP (SHapley Additive exPlanations) works with scikit-learn models, so let's quickly train a model with the scikit-learn library. For simplicity, let's leave the day and time features out.

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "7f360077-6a70-442f-9574-c7f2a0429116", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "mlflow.autolog(disable=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8a18cd76-5efc-4b35-a6cd-b17b4c11586b", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.model_selection import train_test_split\n", "#\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "ee848b7e-7b52-45d9-84b2-9a030c20f13b", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "# load dataset previously prepared\n", "pandas_tips = tips_sdf.toPandas()\n", "#\n", "# set features dataset\n", "pandas_tips_features = pandas_tips.drop([\"tip\", \"day\", \"time\"], axis=1)\n", "#\n", "# set target\n", "pandas_tips_target = pandas_tips[\"tip\"]\n", "#\n", "# train test split\n", "pd_df_X_train, pd_df_X_test, pd_df_y_train, pd_df_y_test = train_test_split(pandas_tips_features,\n", " pandas_tips_target,\n", " test_size=0.33,\n", " random_state=42)\n", "#\n", "# fit \n", "fitted_rfr_model = RandomForestRegressor().fit(pd_df_X_train, pd_df_y_train)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "206ff31c-6ee9-45f2-b47c-56d1023e9556", "showTitle": false, "title": "" } }, "source": [ "

Here is an example of logging SHAP to mlflow:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9574a53e-48ab-429d-9315-a730e3c45bf4", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "56a5641835084ae89614b0a460224285", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/81 [00:00Here is an example of logging figure to mlflow:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5e0431f5-6c8f-4c5a-a639-d60e8c780bbe", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "image/png": "\n" }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "arguments": {}, "data": "\n", "datasetInfos": [], "metadata": {}, "removedWidgets": [], "type": "image" } }, "output_type": "display_data" } ], "source": [ "with mlflow.start_run(run_name=\"figure_tips\"):\n", " #\n", " # Generate feature importance plot thanks to feature_importances_ attribute of the RandomForestRegressor model\n", " feature_importances = pd.Series(fitted_rfr_model.feature_importances_, index=pd_df_X_train.columns)\n", " fig, ax = plt.subplots()\n", " feature_importances.plot.bar(ax=ax)\n", " ax.set_title(\"Feature importances using MDI\")\n", " ax.set_ylabel(\"Mean decrease in impurity\")\n", " #\n", " # Log figure to mlflow\n", " mlflow.log_figure(fig, \"feature_importance_rf.png\")" ] } ], "metadata": { "application/vnd.databricks.v1+notebook": { "dashboards": [], "language": "python", "notebookMetadata": { "mostRecentlyExecutedCommandWithImplicitDF": { "commandId": 1774797690553258, "dataframes": [ "_sqldf" ] }, "pythonIndentUnit": 2 }, "notebookName": "Databricks-ML-professional-S01c-Advanced-Experiment-Tracking", "widgets": {} }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 }