{ "cells": [ { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "58fab4bb-231e-48cf-8ed4-fc15a1b22845", "showTitle": false, "title": "" } }, "source": [ "

Databricks-ML-professional-S01c-Advanced-Experiment-Tracking

" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "8710b4bb-7fe6-484e-86a9-ffd51a7d4d7a", "showTitle": false, "title": "" } }, "source": [ "
\n", "
\n", "

This Notebook adds information related to the following requirements:


\n", "Advanced Experiment Tracking:\n", "\n", "
\n", "

Download this notebook in ipynb format here.

\n", "
\n", "
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b5f6d0da-1d81-4fa0-9770-a9e4d6863534", "showTitle": false, "title": "" } }, "source": [ "
\n", "1. Import libraries
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8a2d2e59-7426-4d5f-8d97-3dcff6e5151d", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "application/vnd.databricks.v1+bamboolib_hint": "{\"pd.DataFrames\": [], \"version\": \"0.0.1\"}", "text/plain": [] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "#\n", "from pyspark.sql.functions import *\n", "#\n", "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", "from pyspark.ml.regression import GeneralizedLinearRegression, FMRegressor, LinearRegression\n", "from pyspark.ml.evaluation import RegressionEvaluator\n", "from pyspark.ml import Pipeline\n", "#\n", "import mlflow\n", "#\n", "from hyperopt import hp, fmin, tpe, Trials, STATUS_OK\n", "#\n", "import logging" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "85c05c1b-015d-405a-b6be-f8484a985d96", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "logging.getLogger(\"mlflow\").setLevel(logging.FATAL)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "aa08db2c-a856-4c86-81fe-9a8b7322cd6a", "showTitle": false, "title": "" } }, "source": [ "
\n", "2. Load dataset, convert to Spark DataFrame
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5b64ff08-1603-4d0c-bc4e-19c0094c3b9c", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
total_billtipsexsmokerdaytimesize
16.991.01FemaleNoSunDinner2
10.341.66MaleNoSunDinner3
21.013.5MaleNoSunDinner3
23.683.31MaleNoSunDinner2
24.593.61FemaleNoSunDinner4
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "aggData": [], "aggError": "", "aggOverflow": false, "aggSchema": [], "aggSeriesLimitReached": false, "aggType": "", "arguments": {}, "columnCustomDisplayInfos": {}, "data": [ [ 16.99, 1.01, "Female", "No", "Sun", "Dinner", 2 ], [ 10.34, 1.66, "Male", "No", "Sun", "Dinner", 3 ], [ 21.01, 3.5, "Male", "No", "Sun", "Dinner", 3 ], [ 23.68, 3.31, "Male", "No", "Sun", "Dinner", 2 ], [ 24.59, 3.61, "Female", "No", "Sun", "Dinner", 4 ] ], "datasetInfos": [], "dbfsResultPath": null, "isJsonSchema": true, "metadata": {}, "overflow": false, "plotOptions": { "customPlotOptions": {}, "displayType": "table", "pivotAggregation": null, "pivotColumns": null, "xColumns": null, "yColumns": null }, "removedWidgets": [], "schema": [ { "metadata": "{}", "name": "total_bill", "type": "\"double\"" }, { "metadata": "{}", "name": "tip", "type": "\"double\"" }, { "metadata": "{}", "name": "sex", "type": "\"string\"" }, { "metadata": "{}", "name": "smoker", "type": "\"string\"" }, { "metadata": "{}", "name": "day", "type": "\"string\"" }, { "metadata": "{}", "name": "time", "type": "\"string\"" }, { "metadata": "{}", "name": "size", "type": "\"long\"" } ], "type": "table" } }, "output_type": "display_data" } ], "source": [ "tips_df = sns.load_dataset(\"tips\")\n", "#\n", "tips_sdf = spark.createDataFrame(tips_df)\n", "#\n", "display(tips_sdf.limit(5))" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "2b595b34-0633-4f66-9ca0-6067f4cc0716", "showTitle": false, "title": "" } }, "source": [ "
\n", "3. Prepare data
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "453316e6-0dc3-41b0-9730-27c39ed9bdf1", "showTitle": false, "title": "" } }, "source": [ "

Some transformations are applied to prepare the dataset for training an ML model.

\n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "\n", " \n", " \n", "\n", "
column namecomment
tiptarget to predict. Contains numeric
total_billnumeric column to keep as is
sexContains Female and Male converted to 1 and 0
smokerContains Yes and No converted to 1 and 0
timeContains Dinner and Lunch converted to 1 and 0
daycategorical column to One Hot Encode
sizecategorical column to One Hot Encode
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "92c6fbbf-0a08-4fee-8ad7-abdf5a0f9ea4", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "tips_sdf = tips_sdf.selectExpr(\"total_bill\",\n", " \"tip\",\n", " \"case when sex = 'Female' then 1 else 0 end as sex\",\n", " \"case when smoker = 'yes' then 1 else 0 end as smoker\",\n", " \"case when time = 'Dinner' then 1 else 0 end as time\",\n", " \"day\",\n", " \"size\")\n", "#\n", "train_df, test_df = tips_sdf.randomSplit([.8, .2], seed=42)\n", "#\n", "ohe_cols = [\"size\", \"day\"]\n", "num_cols = [\"total_bill\", \"sex\", \"smoker\", \"time\"]\n", "target_col = \"tip\"\n", "#\n", "string_indexer = StringIndexer(inputCols=ohe_cols, outputCols=[c+\"_index\" for c in ohe_cols], handleInvalid=\"skip\")\n", "#\n", "ohe = OneHotEncoder()\n", "ohe.setInputCols([c+\"_index\" for c in ohe_cols])\n", "ohe.setOutputCols([c+\"_ohe\" for c in ohe_cols])\n", "#\n", "assembler_inputs = [c+\"_ohe\" for c in ohe_cols] + num_cols\n", "vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol=\"features\")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "910af898-da90-4e26-a856-cdb4b902e101", "showTitle": false, "title": "" } }, "source": [ "
\n", "4. Evaluator and model
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "06212c8c-e7bf-45e7-827f-fd3fcad64486", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "glr = GeneralizedLinearRegression(featuresCol=\"features\", labelCol=target_col, maxIter=10)\n", "lrm = LinearRegression(featuresCol=\"features\", labelCol=target_col)\n", "fmr = FMRegressor(featuresCol=\"features\", labelCol=target_col, stepSize=0.001)\n", "evaluator = RegressionEvaluator(labelCol=target_col, predictionCol=\"prediction\", metricName=\"rmse\")" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "cd1fc5a1-c77d-45e4-88b2-d2861900b3e5", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "5. Perform MLflow experiment tracking workflows using model signatures and input examples
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "6af34c76-f8fd-40a6-a62a-7dd1a94e88de", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "inputs: \n", " ['total_bill': double, 'tip': double, 'sex': integer, 'smoker': integer, 'time': integer, 'day': string, 'size': long]\n", "outputs: \n", " ['tip': double]\n", "\n" ] } ], "source": [ "signature = mlflow.models.infer_signature(train_df, train_df[[\"tip\"]]);\n", "print(signature)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "40df8662-ab45-49a0-8069-8943078270eb", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_billtipsexsmokertimedaysize
08.772.00001Sun2
19.551.45001Sat2
29.941.56001Sun2
310.271.71001Sun2
410.292.60101Sun2
\n", "
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "arguments": {}, "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
total_billtipsexsmokertimedaysize
08.772.00001Sun2
19.551.45001Sat2
29.941.56001Sun2
310.271.71001Sun2
410.292.60101Sun2
\n
", "datasetInfos": [], "metadata": {}, "removedWidgets": [], "textData": null, "type": "htmlSandbox" } }, "output_type": "display_data" } ], "source": [ "input_example = train_df.toPandas().head()\n", "input_example" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "c0a1d573-d054-48bb-864a-fb9eab2efaa3", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "model_name = \"linear-regression\"\n", "#\n", "with mlflow.start_run(run_name=\"Tip-run\") as run:\n", " #\n", " # define pipeline stages according to model\n", " stages = [string_indexer, ohe, vec_assembler, lrm]\n", " #\n", " # set pipeline\n", " pipeline = Pipeline(stages=stages)\n", " #\n", " # fit pipeline to train set\n", " model = pipeline.fit(train_df)\n", " #\n", " # manually log model to mlflow\n", " mlflow.spark.log_model(model, model_name, signature=signature, input_example=input_example)\n", " #\n", " # manually log parameter to mlflow\n", " mlflow.log_param(\"maxIter\", 11)\n", " #\n", " # predict test set\n", " pred_df = model.transform(test_df)\n", " #\n", " # evaluate prediction\n", " rmse = evaluator.evaluate(pred_df)\n", " #\n", " # manually log metric to mlflow\n", " mlflow.log_metric(\"rmse\", rmse)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "b7c1ad7c-c381-4758-bb59-5114ba6f0ba3", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "6. Identify the requirements for tracking nested runs
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "89edcb39-0b90-44ca-b6fd-5af69c3115a3", "showTitle": false, "title": "" } }, "source": [ "

It is possible to log to MLflow using nested runs: start a parent run, then start each child run inside the parent's with-block using nested=True:

\n", "" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "e5839d28-4117-400d-9a8c-d7fa5fbd0665", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "with mlflow.start_run(run_name=\"tips_evaluation\") as run_parent:\n", " #\n", " # loop on the three regression models\n", " for regression_model in [glr, lrm, fmr]:\n", " #\n", " # get model name\n", " model_name = regression_model.__str__().split(\"_\")[0]\n", " #\n", " # Nest mlflow logging\n", " with mlflow.start_run(run_name=model_name, nested=True) as run:\n", " #\n", " # define pipeline stages according to model\n", " stages = [string_indexer, ohe, vec_assembler, regression_model]\n", " #\n", " # set pipeline\n", " pipeline = Pipeline(stages=stages)\n", " #\n", " # fit pipeline to train set\n", " model = pipeline.fit(train_df)\n", " #\n", " # log model to mlflow\n", " mlflow.spark.log_model(model, model_name, signature=signature, input_example=input_example)\n", " #\n", " # predict test set\n", " pred_df = model.transform(test_df)\n", " #\n", " # evaluate prediction\n", " rmse = evaluator.evaluate(pred_df)\n", " #\n", " # log evaluation to mlflow\n", " mlflow.log_metric(\"rmse\", rmse)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "f04a8cf6-a501-4e11-a7af-66b9b9bd6744", "showTitle": false, "title": "" } }, "source": [ "" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "29d945c5-a93c-4f84-a01b-341d71e9f980", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "7. Describe the process of enabling autologging, including with the use of Hyperopt
" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "3ea640ea-2ab2-46f6-b53f-a440ef888681", "showTitle": false, "title": "" } }, "source": [ "
Here we enable MLflow autologging with autolog() and train a simple model. Autologging automatically records parameters, metrics, and models for every supported library in use.
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "3b75ef74-0a1a-4740-8dc1-567388562b72", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "mlflow.autolog()" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "c28cef57-eb3d-40d6-91f0-24b5ea00505f", "showTitle": false, "title": "" } }, "source": [ "

Now let's fit and evaluate a model:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "2e9b4b6e-18be-4c01-ac94-ddfb7263b97b", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Out[12]: 1.3054572176798678" ] } ], "source": [ "# fit pipeline to train set\n", "model_lrm_autolog = pipeline.fit(train_df)\n", "#\n", "# predict test set\n", "pred_df = model_lrm_autolog.transform(test_df)\n", "#\n", "# evaluate\n", "evaluator.evaluate(pred_df)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "431483d6-48ce-4392-87c2-95dabcfd87c8", "showTitle": false, "title": "" } }, "source": [ "

After that, the MLflow UI shows the many parameters that have been logged.

\n", "

Alternatively, we can retrieve and display the logged parameters of the latest run programmatically:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "eb984d77-72af-4a8b-8ae7-14188488963a", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "text/html": [ "
run_idexperiment_idstatusartifact_uristart_timeend_timemetrics.rmse_test_dfmetrics.rmsemetrics.rmse_unknown_datasetparams.FMRegressor.fitInterceptparams.FMRegressor.maxIterparams.OneHotEncoder.outputColparams.OneHotEncoder.outputColsparams.stagesparams.StringIndexer.stringOrderTypeparams.StringIndexer.inputColsparams.FMRegressor.tolparams.StringIndexer.outputColsparams.StringIndexer.handleInvalidparams.FMRegressor.solverparams.FMRegressor.factorSizeparams.OneHotEncoder.handleInvalidparams.OneHotEncoder.inputColsparams.StringIndexer.outputColparams.FMRegressor.fitLinearparams.FMRegressor.miniBatchFractionparams.VectorAssembler.handleInvalidparams.VectorAssembler.inputColsparams.FMRegressor.predictionColparams.FMRegressor.regParamparams.FMRegressor.labelColparams.FMRegressor.featuresColparams.FMRegressor.initStdparams.FMRegressor.stepSizeparams.OneHotEncoder.dropLastparams.FMRegressor.seedparams.VectorAssembler.outputColparams.maxIterparams.LinearRegression.maxIterparams.LinearRegression.standardizationparams.LinearRegression.tolparams.LinearRegression.solverparams.LinearRegression.elasticNetParamparams.LinearRegression.maxBlockSizeInMBparams.LinearRegression.featuresColparams.LinearRegression.labelColparams.LinearRegression.fitInterceptparams.LinearRegression.aggregationDepthparams.LinearRegression.lossparams.LinearRegression.predictionColparams.LinearRegression.epsilonparams.LinearRegression.regParamtags.mlflow.databricks.cluster.idtags.mlflow.databricks.cluster.libraries.errortags.mlflow.databricks.notebookRevisionIDtags.mlflow.databricks.workspaceIDtags.mlflow.databricks.notebook.commandIDtags.mlflow.source.typetags.mlflow.databricks.webappURLtags.mlflow.runNametags.estimator_classtags.mlflow.autologgingtags.mlflow.databricks.notebookIDtags.estimator_nametags.mlflow.parentRunIdtags.mlflow.rootRunIdtags.mlflow.log-model.history
08b972964438470595f2ba9ba6aa9d403541968995997190FINISHEDdbfs:/databricks/mlflow-tracking/3541968995997190/08b972964438470595f2ba9ba6aa9d40/artifacts2023-11-22T16:55:45.974+00002023-11-22T16:55:56.280+00001.3054572176798678nullnullTrue100OneHotEncoder_c95dbc53b9cc__output['size_ohe', 'day_ohe']['StringIndexer', 'OneHotEncoder', 'VectorAssembler', 'FMRegressor']frequencyDesc['size', 'day']1e-06['size_index', 'day_index']skipadamW8error['size_index', 'day_index']StringIndexer_c3caebc64717__outputTrue1.0error['size_ohe', 'day_ohe', 'total_bill', 'sex', 'smoker', 'time']prediction0.0tipfeatures0.010.001True-2921654334123211668featuresnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull1027-081006-5cgi5kuhThis message class grpc_shaded.com.databricks.api.proto.managedLibraries.ClusterStatus DID NOT match any methods in the stub class grpc_shaded.com.databricks.api.proto.cluster.ClusterServiceGrpc$ClusterServiceBlockingStub170067215661136075798609407187308506017976005609_4719503818729863905_49d8e6f9a405484da1266fc91cafd976NOTEBOOKhttps://eastus-c3.azuredatabricks.netcolorful-snake-723pyspark.ml.pipeline.Pipelinepyspark.ml3541968995997190Pipelinenullnullnull
" ] }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "aggData": [], "aggError": "", "aggOverflow": false, "aggSchema": [], "aggSeriesLimitReached": false, "aggType": "", "arguments": {}, "columnCustomDisplayInfos": {}, "data": [ [ "08b972964438470595f2ba9ba6aa9d40", "3541968995997190", "FINISHED", "dbfs:/databricks/mlflow-tracking/3541968995997190/08b972964438470595f2ba9ba6aa9d40/artifacts", "2023-11-22T16:55:45.974+0000", "2023-11-22T16:55:56.280+0000", 1.3054572176798678, null, null, "True", "100", "OneHotEncoder_c95dbc53b9cc__output", "['size_ohe', 'day_ohe']", "['StringIndexer', 'OneHotEncoder', 'VectorAssembler', 'FMRegressor']", "frequencyDesc", "['size', 'day']", "1e-06", "['size_index', 'day_index']", "skip", "adamW", "8", "error", "['size_index', 'day_index']", "StringIndexer_c3caebc64717__output", "True", "1.0", "error", "['size_ohe', 'day_ohe', 'total_bill', 'sex', 'smoker', 'time']", "prediction", "0.0", "tip", "features", "0.01", "0.001", "True", "-2921654334123211668", "features", null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, "1027-081006-5cgi5kuh", "This message class grpc_shaded.com.databricks.api.proto.managedLibraries.ClusterStatus DID NOT match any methods in the stub class grpc_shaded.com.databricks.api.proto.cluster.ClusterServiceGrpc$ClusterServiceBlockingStub", "1700672156611", "3607579860940718", "7308506017976005609_4719503818729863905_49d8e6f9a405484da1266fc91cafd976", "NOTEBOOK", "https://eastus-c3.azuredatabricks.net", "colorful-snake-723", "pyspark.ml.pipeline.Pipeline", "pyspark.ml", "3541968995997190", "Pipeline", null, null, null ] ], "datasetInfos": [], "dbfsResultPath": null, "isJsonSchema": true, "metadata": {}, "overflow": false, "plotOptions": { "customPlotOptions": {}, "displayType": "table", "pivotAggregation": null, "pivotColumns": null, "xColumns": null, "yColumns": null }, "removedWidgets": [], "schema": [ { "metadata": "{}", "name": "run_id", "type": 
"\"string\"" }, { "metadata": "{}", "name": "experiment_id", "type": "\"string\"" }, { "metadata": "{}", "name": "status", "type": "\"string\"" }, { "metadata": "{}", "name": "artifact_uri", "type": "\"string\"" }, { "metadata": "{}", "name": "start_time", "type": "\"timestamp\"" }, { "metadata": "{}", "name": "end_time", "type": "\"timestamp\"" }, { "metadata": "{}", "name": "metrics.rmse_test_df", "type": "\"double\"" }, { "metadata": "{}", "name": "metrics.rmse", "type": "\"double\"" }, { "metadata": "{}", "name": "metrics.rmse_unknown_dataset", "type": "\"double\"" }, { "metadata": "{}", "name": "params.FMRegressor.fitIntercept", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.maxIter", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.outputCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.outputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.stages", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.stringOrderType", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.inputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.tol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.outputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.handleInvalid", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.solver", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.factorSize", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.handleInvalid", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.inputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.StringIndexer.outputCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.fitLinear", "type": "\"string\"" }, { "metadata": "{}", "name": 
"params.FMRegressor.miniBatchFraction", "type": "\"string\"" }, { "metadata": "{}", "name": "params.VectorAssembler.handleInvalid", "type": "\"string\"" }, { "metadata": "{}", "name": "params.VectorAssembler.inputCols", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.predictionCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.regParam", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.labelCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.featuresCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.initStd", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.stepSize", "type": "\"string\"" }, { "metadata": "{}", "name": "params.OneHotEncoder.dropLast", "type": "\"string\"" }, { "metadata": "{}", "name": "params.FMRegressor.seed", "type": "\"string\"" }, { "metadata": "{}", "name": "params.VectorAssembler.outputCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.maxIter", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.maxIter", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.standardization", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.tol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.solver", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.elasticNetParam", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.maxBlockSizeInMB", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.featuresCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.labelCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.fitIntercept", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.aggregationDepth", "type": "\"string\"" }, { "metadata": "{}", 
"name": "params.LinearRegression.loss", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.predictionCol", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.epsilon", "type": "\"string\"" }, { "metadata": "{}", "name": "params.LinearRegression.regParam", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.cluster.id", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.cluster.libraries.error", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.notebookRevisionID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.workspaceID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.notebook.commandID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.source.type", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.webappURL", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.runName", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.estimator_class", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.autologging", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.databricks.notebookID", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.estimator_name", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.parentRunId", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.rootRunId", "type": "\"string\"" }, { "metadata": "{}", "name": "tags.mlflow.log-model.history", "type": "\"string\"" } ], "type": "table" } }, "output_type": "display_data" } ], "source": [ "output_mlflow = (spark.createDataFrame(mlflow.search_runs())\n", " .drop(*['tags.mlflow.source.name',\n", " 'tags.mlflow.databricks.notebookPath',\n", " 'tags.mlflow.user',\n", " 'tags.mlflow.databricks.workspaceURL',\n", " 'tags.mlflow.databricks.cluster.info']))\n", 
"display(output_mlflow.orderBy(desc(\"end_time\")).limit(1))" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "54efcfb9-459f-4682-af3a-71d7b9f188bb", "showTitle": false, "title": "" } }, "source": [ "
Here, let's use HyperOpt for hyperparameter tuning and use autolog() to log everything.
\n", "

HyperOpt:

\n", "" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "720a634b-bc15-41e4-ad84-93ff25a01bdf", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "def train_model(maxIter, regParam, elasticNetParam, labelCol):\n", " \"\"\"\n", " This train() function:\n", " - takes hyperparameters as inputs (for tuning later)\n", " - returns the rmse score on the test dataset\n", " \"\"\"\n", " # Use MLflow to track training.\n", " # Specify \"nested=True\" since this single model will be logged as a child run of Hyperopt's run.\n", " with mlflow.start_run(nested=True):\n", " #\n", " model_hyperopt = LinearRegression(maxIter=maxIter,\n", " regParam=regParam,\n", " elasticNetParam=elasticNetParam,\n", " labelCol=target_col)\n", " #\n", " evaluator_hyperopt = RegressionEvaluator(labelCol=target_col, predictionCol=\"prediction\")\n", " #\n", " stages = [string_indexer, ohe, vec_assembler, model_hyperopt]\n", " #\n", " # set pipeline\n", " pipeline = Pipeline(stages=stages)\n", " #\n", " # fit pipeline to train set\n", " model_rfr_hyperopt = pipeline.fit(train_df)\n", " #\n", " # predict test set\n", " pred_df = model_rfr_hyperopt.transform(test_df)\n", " #\n", " # evaluate\n", " rmse = evaluator_hyperopt.evaluate(pred_df)\n", " #\n", " # log rmse for each child run\n", " mlflow.log_metric(\"rmse\", rmse)\n", " #\n", " return model_rfr_hyperopt, rmse" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9c6604cb-3318-4341-93ba-465ea966a13c", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "def objective(params):\n", " \"\"\" This function is the function to minimize by hyperopt \"\"\"\n", " #\n", " model, rmse = train_model(maxIter=params[\"maxIter\"],\n", " 
regParam=params[\"regParam\"],\n", " elasticNetParam=params[\"elasticNetParam\"],\n", " labelCol=target_col)\n", " #\n", " return {'loss': rmse, 'status': STATUS_OK}" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "116f2066-932c-4268-98e0-2fc9ff77e6dd", "showTitle": false, "title": "" } }, "source": [ "

Let's define the hyperparameter search spaces:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8edfe198-1d5f-46fc-af5e-e9cd10bcc14f", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "search_spaces = {\"maxIter\": hp.quniform(\"maxIter\", 1, 100, 1),\n", " \"regParam\": hp.uniform(\"regParam\", 0.1, 10),\n", " \"elasticNetParam\": hp.uniform(\"elasticNetParam\", 0, 1)}" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "e1a9966d-c2da-46bb-a251-06213916ec80", "showTitle": false, "title": "" } }, "source": [ "

Finally let's run the hyperparameter tuning with HyperOpt:

\n", "

As we are using a model from MLlib, whose training is already distributed by Spark, we pass an instance of the Trials class (rather than SparkTrials) as the trials argument of the fmin function.

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "1a5c898f-cb76-4e8b-ab32-8695efd97c23", "showTitle": false, "title": "" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", " 0%| | 0/15 [00:00See also this page or this video to learn more on HyperOpt.

" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "3d035eb8-cc74-47d6-aa7d-469b39fcb013", "showTitle": false, "title": "" } }, "source": [ "\n", "
\n", "8. Log and view artifacts like SHAP plots, custom visualizations, feature data, images, and metadata
\n", "

Logging SHAP (SHapley Additive exPlanations) explanations with MLflow works with scikit-learn models, so let's quickly train one with the scikit-learn library. For simplicity, let's leave the day and time features out.

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "7f360077-6a70-442f-9574-c7f2a0429116", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "mlflow.autolog(disable=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "8a18cd76-5efc-4b35-a6cd-b17b4c11586b", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.model_selection import train_test_split\n", "#\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "ee848b7e-7b52-45d9-84b2-9a030c20f13b", "showTitle": false, "title": "" } }, "outputs": [], "source": [ "# load dataset previously prepared\n", "pandas_tips = tips_sdf.toPandas()\n", "#\n", "# set features dataset\n", "pandas_tips_features = pandas_tips.drop([\"tip\", \"day\", \"time\"], axis=1)\n", "#\n", "# set target\n", "pandas_tips_target = pandas_tips[\"tip\"]\n", "#\n", "# train test split\n", "pd_df_X_train, pd_df_X_test, pd_df_y_train, pd_df_y_test = train_test_split(pandas_tips_features,\n", " pandas_tips_target,\n", " test_size=0.33,\n", " random_state=42)\n", "#\n", "# fit \n", "fitted_rfr_model = RandomForestRegressor().fit(pd_df_X_train, pd_df_y_train)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "206ff31c-6ee9-45f2-b47c-56d1023e9556", "showTitle": false, "title": "" } }, "source": [ "

Here is an example of logging SHAP explanations to MLflow:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "9574a53e-48ab-429d-9315-a730e3c45bf4", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "56a5641835084ae89614b0a460224285", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/81 [00:00Here is an example of logging figure to mlflow:

" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { "byteLimit": 2048000, "rowLimit": 10000 }, "inputWidgets": {}, "nuid": "5e0431f5-6c8f-4c5a-a639-d60e8c780bbe", "showTitle": false, "title": "" } }, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEpCAYAAACeISWkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAeQElEQVR4nO3debRcZZ3u8e9DZIiaMJjIDAFENCAKHRkEG0VoCQh4mzFcHBBBbBFtBEFFBPSCYiuuFrwXlFllRgwSQG2Eq2BDAoIQEAkIkoASxmCQyPD0H3sfrRzOsHNSw6naz2ets6r2UHv/ap9z6lfvsN9XtomIiPpaptMBREREZyURRETUXBJBRETNJRFERNRcEkFERM0lEURE1FwSQYxqkj4v6XudjqNOcs3rR7mPoHdJehBYFXipYfUbbT+ylMf8qO2fL1103UfSccAbbO/f6Vi6lSQD84E1bL9YrlsWmAdMtK1y3fXAVsALgIH7gEuAU2wvKvc5jvw+miIlgt63q+3XNvyMOAk0g6RXdfL8I9WtcY9STwFTG5anluv6O9T2OGB14DPAvsAMSWp9iPWSRFBDklaUdKakRyXNk/QVSWPKbRtIuk7SE5Iel/QDSSuV284H1gGulPQXSZ+V9C5Jc/sd/0FJO5TPj5N0qaTvS1oAfHio8w8Q63GSvl8+nyTJkg6Q9LCkpyQdIuntkn4r6WlJpza89sOSbpR0qqRnJP1O0nsatq8habqkJyXNkXRQv/M2xn0I8Hlgn/K931Hud4CkeyQ9K+kBSR9rOMa7JM2V9BlJj5Xv94CG7WMlfUPSQ2V8v5I0tty2laSbyvd0h6R39XtfD5Tn/IOk/z3ItTtH0lf6x9OwfFR5/Z+VdG/ftRnkmn9I0h/Lv4kv9HsP55a/i3vKv4nF/h4GcD7wwYblDwLnDbaz7YW2rwd2A7YGdhnm+LGEkgjq6RzgReANwGbAvwAfLbcJOAlYA3gzsDZwHIDtDwB/5B+ljJMrnm934FJgJeAHw5y/ii2BDYF9gG8BXwB2ADYG9pa0Xb997wcmAF8CLpe0SrntQmBu+V73BE6UtP0gcZ8JnAhcVL73t5b7PAa8DxgPHACcImnzhmOsBqwIrAkcCJwmaeVy238A/wS8A1gF+CzwsqQ1gauAr5TrjwAukzRR0muA/wSmlt+W3wHcvgTXDgBJGwGHAm8vj/Ne4MEhXrItsBHwHuBYSW8u138JmASsD+wIVKmmuQL4Z0krldfincCPh3uR7T8Cs8r9o4mSCHrfFeW3yqclXSFpVWBn4NPlN63HgFMoit3YnmP7Z7YX2Z4PfBPYbvDDV/Jr21fYfpniA3PQ81f0ZdvP2/4psBC4wPZjtucBv6RILn0eA75l+wXbFwH3ArtIWhvYBjiqPNbtwPdY/Jvq3+O2/deBArF9le37XbgB+CmLf1C9AJxQnn8G8BdgI0nLAB8BPmV7nu2XbN9U1n/vD8ywPaM8988oPgB3Lo/5MrCJpLG2H7U9ewmuXZ+XgOWByZKWtf2g7fuH2P9423+1fQdwB9CXCPcGTrT9lO25FElqOM8DV1Ik8n2A6eW6Kh6hSI7RREkEve/9tlcqf94PrAssCzzalyCA04HXA0haVdKFZZXBAuD7FN+ml8bDDc+HPH9Ff254/tcBll/bsDzPi/eIeIiiBLAG8KTtZ/ttW3
OQuAckaaqk/y6rl56m+LBuvF5P9DWKlp4r45sArEBRWulvXWCvhgT+NMU38tVtL6T48DyE4hpeJelNw8XZn+05wKcpSnuPlb/zNYZ4yZ8GeA9QXMfG6zTsNSudR5F0h6wWGsCawJNLsH9UkERQPw8Di4AJDQlivO2Ny+0nUvTSeIvt8RTfThsb5/p3M1sIvLpvoazrn9hvn8bXDHf+ZltTWqxxcR2Kb5WPAKtIGtdv27xB4n7FsqTlgcsoqnhWtb0SMIPFr9dgHqf4FrzBANseBs5vuD4r2X6N7a8C2L7W9o4Ujai/A747yDkW+91QVFP9483YP7S9LUXiMfC1CnH39yiwVsPy2hVf90uK+FcFflXlBWUp7p/K10YTJRHUjO1HKaovviFpvKRlVDQQ91X/jKOovnimrKs+st8h/kxRH9zn98AKknZR0Q3wGIoqh5Gev9leDxwmaVlJe1G0e8yw/TBwE3CSpBUkbUpRh//9IY71Z2BSWa0DsBzFe50PvChpKkV7x7DKarKzgG+qaLQeI2nrMrl8H9hV0nvL9SuUDb1rlSW23cu2gkUUv6uXBznN7cDOklaRtBpFCQAo2ggkbV+e73mKktRgxxnKxcDnJK1c/r0cWvH9G9gV2K1fie0VJL26/Pv4MXALRbKNJkoiqKcPUnyI3U3Rbe9Sim9nAMcDmwPPUDRYXt7vtScBx5RVFkfYfgb4N4r69XkU30KH6zUy1Pmb7WaKhuXHgf8D7Gn7iXLbNIqGzkeAHwFfGub+iEvKxyck3VZWKx1G8WH4FLAfRX13VUcAdwIzKao7vgYsUyap3Sl6Kc2nKCEcSfH/ugxweBnzkxTtNx8f5PjnU9TnP0iRfC9q2LY88FWK6/InioT5uSWIvc8JFL/vPwA/p/hdLqryQtuzh2nfOFXSsxQJ+FsUpa+dyiQaTZQbyqJnSfowxc1v23Y6lrqQ9HFgX9utKuFFC6REEBEjJml1SduUVXwbUdz49aNOxxVLJndLRsTSWI6i19d6wNMU92Z8p5MBxZJL1VBERM2laigiouaSCCIiaq7r2ggmTJjgSZMmdTqMiIiucuuttz5uu//NnkAXJoJJkyYxa9asTocREdFVJD002LZUDUVE1FwSQUREzSURRETUXBJBRETNJRFERNRcEkFERM0lEURE1FwSQUREzXXdDWWtMOnoqzodQiUPfnWXTocQET0oJYKIiJpLIoiIqLkkgoiImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLkkgoiImksiiIiouSSCiIiaa2kikLSTpHslzZF09ADb15H0C0m/kfRbSTu3Mp6IiHilliUCSWOA04CpwGRgmqTJ/XY7BrjY9mbAvsB3WhVPREQMrJUlgi2AObYfsP034EJg9377GBhfPl8ReKSF8URExABaOQz1msDDDctzgS377XMc8FNJnwReA+zQwngiImIAnW4sngacY3stYGfgfEmviEnSwZJmSZo1f/78tgcZEdHLWpkI5gFrNyyvVa5rdCBwMYDtXwMrABP6H8j2Gban2J4yceLEFoUbEVFPrUwEM4ENJa0naTmKxuDp/fb5I/AeAElvpkgE+cofEdFGLUsEtl8EDgWuBe6h6B00W9IJknYrd/sMcJCkO4ALgA/bdqtiioiIV2rpnMW2ZwAz+q07tuH53cA2rYwhIiKG1unG4oiI6LAkgoiImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLkkgoiImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLlhE4GkWyV9QtLK7QgoIiLaq0qJYB9gDWCmpAslvVeSWhxXRES0ybCJwPYc218A3gj8EDgLeEjS8ZJWaXWAERHRWpXaCCRtCnwD+DpwGbAXsAC4rnWhRUREOww7Q5mkW4GngTOBo20vKjfdLCmzi0VEdLkqU1XuZfuBxhWS1rP9B9v/2qK4IiKiTapUDV1acV1ERHShQUsEkt4EbAysKK
nxm/94YIVWBxYREe0xVNXQRsD7gJWAXRvWPwsc1MKYIiKijQZNBLZ/DPxY0ta2f93GmCIioo2Gqhr6rO2Tgf0kTeu/3fZhLY0sIiLaYqiqoXvKx1ntCCQiIjpjqKqhKyWNAd5i+4g2xhQREW00ZPdR2y8BuWksIqKHVbmh7HZJ04FLgIV9K21f3rKoIiKibaokghWAJ4DtG9YZSCKIiOgBwyYC2we0I5CIiOiMKoPOnU1RAliM7Y+0JKKIiGirKlVDP2l4vgLwv4BHWhNORES0W5WqocsalyVdAPyqZRFFRERbjWTy+g2B1zc7kIiI6IwqbQTPUrQRqHz8E3BUi+OKiIg2qVI1NK4dgURERGdUaSymnI9gW4oSwS9tX9HKoCIion2GbSOQ9B3gEOBO4C7gEEmntTqwiIhojyolgu2BN9s2gKRzgdktjSoiItqmSq+hOcA6Dctrl+uGJWknSfdKmiPp6EH22VvS3ZJmS/phleNGRETzVCkRjAPukXRLufx2YFY5EB22dxvoReUQ1qcBOwJzgZmSptu+u2GfDYHPAdvYfkpSuqVGRLRZlURw7AiPvQUwx/YDAJIuBHYH7m7Y5yDgNNtPAdh+bITnioiIEarSffQGAEnjG/e3/eQwL10TeLhheS6wZb993lge+0ZgDHCc7WuGDzsiIpqlyg1lBwMnAM8DL/OPG8vWb9L5NwTeBawF/H9Jb7H99AAxHAywzjrrEBERzVOlauhIYBPbjy/hsedRNCz3Watc12gucLPtF4A/SPo9RWKY2biT7TOAMwCmTJnyipFQIyJi5Kr0GrofeG4Ex54JbChpPUnLAfsC0/vtcwVFaQBJEyiqih4YwbkiImKEqpQIPgfcJOlmYFHfStuHDfUi2y9KOhS4lqL+/yzbsyWdAMyyPb3c9i+S7gZeAo60/cQI30tERIxAlURwOnAdxZ3FLy/JwW3PAGb0W3dsw3MDh5c/ERHRAVUSwbK280EdEdGjqrQRXC3pYEmrS1ql76flkUVERFtUKRFMKx8/17CuWd1HIyKiw6rcULZeOwKJiIjOGDQRSNre9nXlXASvYPvy1oUVERHtMlSJYDuK3kK7DrDNQBJBREQPGDQR2P5S+XhA+8KJiIh2q9JrKCIielgSQUREzSURRETUXJX7CJD0DmASi89HcF6LYoqIiDaqMh/B+cAGwO0UA8NB0WsoiSAiogdUKRFMASaXA8RFRESPqdJGcBewWqsDiYiIzqhSIpgA3C3pFhafj2C3lkUVERFtUyURHNfqICIionOqDDp3QzsCiYiIzhhq0Llf2d5W0rMUvYT+volicrHxLY8uIiJabqixhrYtH8e1L5yIiGi33FkcEVFzSQQRETWXRBARUXOVEoGkdSXtUD4fKyntBhERPWLYRCDpIOBS4PRy1VrAFS2MKSIi2qhKieATwDbAAgDb9wGvb2VQERHRPlUSwSLbf+tbkPQqFr+vICIiuliVRHCDpM8DYyXtCFwCXNnasCIiol2qJIKjgfnAncDHgBnAMa0MKiIi2qfKWEMvA98FvitpFWCtzE0QEdE7qvQaul7S+DIJ3EqREE5pfWgREdEOVaqGVrS9APhX4DzbWwLvaW1YERHRLlUSwaskrQ7sDfykxfFERESbVUkEJwDXAnNsz5S0PnBfa8OKiIh2qdJYfAlFl9G+5QeAPVoZVEREtM+wiUDSCsCBwMbACn3rbX+khXFFRESbVKkaOh9YDXgvcAPFWEPPtjKoiIhonyqJ4A22vwgstH0usAuwZWvDioiIdqmSCF4oH5+WtAmwIhl0LiKiZwzbRgCcIWll4IvAdOC1wLEtjSoiItpm2BKB7e/Zfsr2DbbXt/162/+vysEl7STpXklzJB09xH57SLKkKUsSfERELL0qQ0ysKulMSVeXy5MlHVjhdWOA04CpwGRgmqTJA+w3DvgUcPOSBh8REUuvShvBORQ3lK1RLv8e+HSF121BcRPaA+V8BhcCuw+w35eBrw
HPVzhmREQ0WZVEMMH2xcDLALZfBF6q8Lo1gYcblueW6/5O0ubA2ravGupAkg6WNEvSrPnz51c4dUREVFUlESyU9DrKWckkbQU8s7QnlrQM8E3gM8Pta/sM21NsT5k4ceLSnjoiIhpU6TV0OEVvoQ0k3QhMBPas8Lp5wNoNy2uV6/qMAzYBrpcExU1r0yXtZntWheNHREQTDJkIygbf7cqfjQAB99p+YajXlWYCG0pajyIB7Avs17fR9jPAhIZzXQ8ckSQQEdFeQ1YN2X4JmGb7Rduzbd9VMQn0tSUcStHQfA9wse3Zkk6QtNtSRx4REU1RpWroRkmnAhcBC/tW2r5tuBfankExx3HjugFvRrP9rgqxREREk1VJBG8rH09oWGdg+6ZHExERbVdlPoJ3tyOQiIjojCp3Fp8oaaWG5ZUlfaWlUUVERNtUuY9gqu2n+xZsPwXs3LKIIiKiraokgjGSlu9bkDQWWH6I/SMiootUaSz+AfBfks4ulw8Azm1dSBER0U5VGou/JukOYIdy1ZdtX9vasCIiol2qlAiguCHsRds/l/RqSeNsZ97iiIgeUKXX0EHApcDp5ao1gStaGFNERLRRlcbiTwDbAAsAbN9H5iyOiOgZVRLBonJiGQAkvYpySOqIiOh+VRLBDZI+D4yVtCNwCXBla8OKiIh2qZIIjgbmA3cCH6MYRO6YVgYVERHtU6X76MvAd8ufiIjoMYMmAkl3MkRbgO1NWxJRRES01VAlgveVj58oH88vH/cnjcURET1j0ERg+yEASTva3qxh01GSbqNoO4iIiC5XpbFYkrZpWHhHxddFREQXqDLExIHAWZJWLJefBj7SsogiIqKtqvQauhV4a18isP1My6OKiIi2qTroXBJARESPSl1/RETNJRFERNRcpaqhsqfQpMb9bZ/XopgiIqKNhk0Eks4HNgBuB14qVxtIIoiI6AFVSgRTgMm2czdxREQPqtJGcBewWqsDiYiIzqhSIpgA3C3pFmBR30rbu7UsqoiIaJsqieC4VgcRERGdU+XO4hvaEUhERHTGsG0EkraSNFPSXyT9TdJLkha0I7iIiGi9Ko3FpwLTgPuAscBHgdNaGVRERLRPpTuLbc8Bxth+yfbZwE6tDSsiItqlSmPxc5KWA26XdDLwKBmaIiKiZ1T5QP9Aud+hwEJgbWCPVgYVERHtU6XX0EOSxgKr2z6+DTFFREQbVek1tCvFOEPXlMtvkzS9xXFFRESbVKkaOg7YgmKKSmzfDqzXsogiIqKtqiSCFwaYnSwD0EVE9IgqiWC2pP2AMZI2lPRt4KYqB5e0k6R7Jc2RdPQA2w+XdLek30r6L0nrLmH8ERGxlKokgk8CG1MMOHcBsAD49HAvkjSG4sazqcBkYJqkyf12+w0wxfamwKXAyZUjj4iIpqjSa+g54Avlz5LYAphj+wEASRcCuwN3Nxz7Fw37/zew/xKeIyIiltKgiWC4nkEVhqFeE3i4YXkusOUQ+x8IXD1ILAcDBwOss846w5w2IiKWxFAlgq0pPsgvAG4G1KogJO1PMRPadgNtt30GcAbAlClT0lAdEdFEQyWC1YAdKQac2w+4CrjA9uyKx55HcRdyn7XKdYuRtANFtdN2thf13x4REa01aGNxOcDcNbY/BGwFzAGul3RoxWPPBDaUtF45VtG+wGLVTZI2A04HdrP92IjeQURELJUhG4slLQ/sQlEqmAT8J/CjKge2/WKZNK4FxgBn2Z4t6QRglu3pwNeB1wKXSAL4Y6bAjIhor6Eai88DNgFmAMfbvmtJD257Rvn6xnXHNjzfYUmPGRERzTVUiWB/itFGPwUcVn5jh6LR2LbHtzi2iIhog0ETge3MORARUQP5sI+IqLkkgoiImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLkkgoiImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLkkgoiImhtyhrKIiF4y6eirOh1CJQ9+dZe2ni8lgo
iImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLkkgoiImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLkkgoiImksiiIiouSSCiIiaSyKIiKi5JIKIiJpLIoiIqLkkgoiImsucxdF0mRc2orukRBARUXNJBBERNdfSRCBpJ0n3Spoj6egBti8v6aJy+82SJrUynoiIeKWWJQJJY4DTgKnAZGCapMn9djsQeMr2G4BTgK+1Kp6IiBhYK0sEWwBzbD9g+2/AhcDu/fbZHTi3fH4p8B5JamFMERHRTyt7Da0JPNywPBfYcrB9bL8o6RngdcDjjTtJOhg4uFz8i6R7WxJxc02g3/tYWqp3eSnXs3mafi1rrlv+NtcdbENXdB+1fQZwRqfjWBKSZtme0uk4ekWuZ/PkWjZXL1zPVlYNzQPWblheq1w34D6SXgWsCDzRwpgiIqKfViaCmcCGktaTtBywLzC93z7TgQ+Vz/cErrPtFsYUERH9tKxqqKzzPxS4FhgDnGV7tqQTgFm2pwNnAudLmgM8SZEsekVXVWV1gVzP5sm1bK6uv57KF/CIiHrLncURETWXRBARUXNJBBERNZdEENHjJI2R9O+djiNGrzQWN4GkK4FBL6Tt3doYTk+QdKDtMxuWxwDH2D6+g2F1LUm32N6i03H0CkmrAicCa9ieWo6jtnXj32w3SSJoAknbDbXd9g3tiqVXSPohsBLFwISrAOcAN9g+ooNhdS1JpwDLAhcBC/vW276tY0F1MUlXA2cDX7D91vKG2N/YfkuHQxuRJIIYtSTtQzGC7UJgP9s3djikriXpFwOstu3t2x5MD5A00/bbJf3G9mblutttv63DoY1IV4w1NNpJupOhq4Y2bWM4PUHShsCngMuANwMfKP/pnutsZN3J9rs7HUOPWSjpdZT/95K2Ap7pbEgjl0TQHO/rdAA96ErgUNs/L4cmP5xi2JKNOxtWd+q1Ou1R4DMUQ+RsIOlGYCKwV2dDGrlUDcWoJGm87QX91r3R9u87FVM367U67dGgvIYbAQLutf1Ch0MasXQfbQJJvyofn5W0oP9jp+PrUmMlnSnpGoDyG+w7OxxTN5tg+2LgZSjGAgNe6mxI3UvS/cBHbc+2fZftFyT9pNNxjVQSQRPY3rZ8HGd7fP/HTsfXpc6hGLBw9XL598CnOxVMD+ipOu1R4AXg3ZLOLkdXhmKira6URNBkkjaXdJikT0rarNPxdLF8g22uw1m8Tvs84JOdDamrPWd7H+Ae4JeS1mGIDiOjXRqLm0jSsRQNRpeXq86RdIntr3QwrG6Vb7DN9RSwHQ112sDbOhlQlxOA7ZMl3Qb8lOJ+l66UxuImKudSfqvt58vlscDttjfqbGTdR9LmwLeBTYC7KHpl7Gn7tx0NrEtJuhXYzfa8cvmfgdPSWDwykna1fWXD8rrAh2yf0MGwRiwlguZ6BFgBeL5cXp5XTs8Z1WwATKWYynQPYEvy97o0DgGukLQrsDlwErBzZ0PqPpLeZPt3wLzyy0qjrm0szj9WE0j6NkUVxjPAbEk/K5d3BG7pZGxd7Iu2L5G0MvBu4D+A/0uREGIJ2Z4p6TCKKozngR1sz+9wWN3ocOBg4BsN6xqrVbryTu1UDTWBpA8Ntd32ue2KpVf03bov6STgTts/bLydP6oZYEDEycCjFG0GGRBxhCTtDVxje4GkL1KUsr7crWM3JRG0kaTLbO/R6Ti6Qdknex5FqWpz4K/ALbbf2tHAukwGRGwNSb+1vamkbYEvU5RYj7XdlSXWdB9tr/U7HUAX2ZviPoL32n6aokfGkR2NqAvZvqHvB/gdMK78uSdJYKn0dWXeBfiu7auA5YbYf1RLiaCNJN1mu38DU0TLlVUZXweup+j6+E7gSNuXdjKubtVrJdYkgjZKIohOkXQHsKPtx8rlicDPu/WDq9MkvRrYiaL96j5JqwNvsf3TDoc2Iuk11F7qdABRW8v0JYHSE6RqeMTK4dAvb1h+lK
IRvislEbTXUZ0OIGrrGknXAheUy/sAV3cwnhhFUjXUBENMTCOKWaAyMU10nKQ9gG3KxV/a/lEn44nRI4mgCcrbywdl+6F2xRIxFEnjaagJsP1kB8OJUSKJIKIGJH0MOJ7iruKX+UdpNV2aI4mgmcoRMr9NMcfucsAYYGHmJIhOk3QfxdSUj3c6lhh90muguU4FpgH3AWOBjwKndTSiiML9wHOdDiJGp5QImkjSLNtT+m4/L9dlfJzouHKSpLOBm4FFfettH9axoGLUSPfR5nqunLbudkknU/QrTqkrRoPTgeuAOylnfYvokxJBE5W9h/5M0T7w78CKFJN/3N/RwKL2UjKNoeTbanO93/bzthfYPt724cD7Oh1UBHC1pIMlrS5plb6fTgcVo0NKBE000FhC+SYWo4GkPzQs/v2fPt1HA9JG0BSSpgH7AetJmt6waTyQG3ZiNDiKASZS6XBMMUokETTHTRQNwxNYfAq7Z4FMth6jwTG2Ly4nUtmeTP0ZDdJG0AS2H7J9ve2tWXzyj7m2X+xsdBFAj02kEs2VRNBEkvaimKx+L4oZtm6WtGdno4oAYJ6k0ylGHZ0haXny/x+lNBY3USb/iNGq1yZSieZKG0FzZfKPGJV6bSKVaK4kgua6eoDJP2Z0MJ6IiGHl22pzmeJW/k3LnzM6G05ExPDSRtBEg9xQ9vcB6CIiRqNUDTWBpI8D/wasL6nxvoFxwI2diSoiopqUCJpA0orAysBJwNENm57NVIARMdolEURE1FwaiyMiai6JICKi5pIIIiJqLokgIqLmkggiImrufwDQIMZWfkA14AAAAABJRU5ErkJggg==\n" }, "metadata": { "application/vnd.databricks.v1+output": { "addedWidgets": {}, "arguments": {}, "data": "\n", "datasetInfos": [], "metadata": {}, "removedWidgets": [], "type": "image" } }, "output_type": "display_data" } ], "source": [ "with mlflow.start_run(run_name=\"figure_tips\"):\n", " #\n", " # Generate feature importance plot thanks to feature_importances_ attribute of the RandomForestRegressor model\n", " feature_importances = pd.Series(fitted_rfr_model.feature_importances_, index=pd_df_X_train.columns)\n", " fig, ax = plt.subplots()\n", " feature_importances.plot.bar(ax=ax)\n", " ax.set_title(\"Feature importances using MDI\")\n", " ax.set_ylabel(\"Mean decrease in impurity\")\n", " #\n", " # Log figure to mlflow\n", " mlflow.log_figure(fig, \"feature_importance_rf.png\")" ] } ], "metadata": { "application/vnd.databricks.v1+notebook": { "dashboards": [], "language": "python", "notebookMetadata": { "mostRecentlyExecutedCommandWithImplicitDF": { "commandId": 1774797690553258, "dataframes": [ "_sqldf" ] }, "pythonIndentUnit": 2 }, "notebookName": "Databricks-ML-professional-S01c-Advanced-Experiment-Tracking", "widgets": {} }, 
"kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 }