Databricks-ML-professional-S01b-Experiment-Tracking
This notebook adds information related to the following requirements:
Experiment Tracking:
- Manually log parameters, models, and evaluation metrics using MLflow
- Programmatically access and use data, metadata, and models from MLflow experiments
Download this notebook in ipynb format here.
import pandas as pd
import seaborn as sns
#
from pyspark.sql.functions import *
#
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
#
import mlflow
#
import logging
logging.getLogger("mlflow").setLevel(logging.FATAL)
tips_df = sns.load_dataset("tips")
#
tips_sdf = spark.createDataFrame(tips_df)
#
display(tips_sdf.limit(5))
total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
21.01 | 3.5 | Male | No | Sun | Dinner | 3 |
23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
display(tips_sdf.filter("size is null"))
total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
The empty result confirms that the size column contains no null values.
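The same kind of missing-value check can be sketched on the pandas side with isna().sum() (illustrative only — the frame below is a small hypothetical sample, not the real tips data):

```python
import pandas as pd

# hypothetical sample mirroring part of the tips schema (not the real dataset)
sample = pd.DataFrame({
    "total_bill": [16.99, 10.34, None],
    "tip": [1.01, 1.66, 2.00],
    "size": [2, 3, None],
})

# count missing values per column
null_counts = sample.isna().sum()
print(null_counts.to_dict())  # → {'total_bill': 1, 'tip': 0, 'size': 1}
```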
Some transformations are applied to prepare the dataset for training an ML model:
column name | comment |
---|---|
tip | target to predict; numeric |
total_bill | numeric column, kept as is |
sex | contains Female and Male, converted to 1 and 0 |
smoker | contains Yes and No, converted to 1 and 0 |
time | contains Dinner and Lunch, converted to 1 and 0 |
day | categorical column to one-hot encode |
size | categorical column to one-hot encode |
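One-hot encoding turns each distinct category into its own 0/1 indicator column. In pandas terms this is what pd.get_dummies does (illustrative only — the pipeline below uses Spark's StringIndexer and OneHotEncoder instead):

```python
import pandas as pd

# tiny illustrative frame (hypothetical rows)
df = pd.DataFrame({"day": ["Sun", "Sat", "Sun", "Thur"]})

# each distinct day value becomes its own 0/1 indicator column
encoded = pd.get_dummies(df, columns=["day"])
print(encoded.columns.tolist())  # → ['day_Sat', 'day_Sun', 'day_Thur']
```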
tips_sdf = tips_sdf.selectExpr("total_bill",
                               "tip",
                               "case when sex = 'Female' then 1 else 0 end as sex",
                               "case when smoker = 'Yes' then 1 else 0 end as smoker",
                               "case when time = 'Dinner' then 1 else 0 end as time",
                               "day",
                               "size")
#
train_df, test_df = tips_sdf.randomSplit([.8, .2])
#
ohe_cols = ["size", "day"]
num_cols = ["total_bill", "sex", "smoker", "time"]
target_col = "tip"
#
string_indexer = StringIndexer(inputCols=ohe_cols, outputCols=[c+"_index" for c in ohe_cols], handleInvalid="skip")
#
ohe = OneHotEncoder()
ohe.setInputCols([c+"_index" for c in ohe_cols])
ohe.setOutputCols([c+"_ohe" for c in ohe_cols])
#
assembler_inputs = [c+"_ohe" for c in ohe_cols] + num_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
gbt = GBTRegressor(featuresCol="features", labelCol=target_col, maxIter=5)
evaluator = RegressionEvaluator(labelCol=target_col, predictionCol="prediction", metricName="rmse")
model_name = "GBT-Regressor"
#
with mlflow.start_run(run_name="Tip-run") as run:
    #
    # define pipeline stages according to model
    stages = [string_indexer, ohe, vec_assembler, gbt]
    #
    # set pipeline
    pipeline = Pipeline(stages=stages)
    #
    # fit pipeline to train set
    model = pipeline.fit(train_df)
    #
    # manually log model to mlflow
    mlflow.spark.log_model(model, model_name)
    #
    # manually log parameter to mlflow
    mlflow.log_param("maxIter", 5)
    #
    # predict test set
    pred_df = model.transform(test_df)
    #
    # evaluate prediction
    rmse = evaluator.evaluate(pred_df)
    #
    # manually log metric to mlflow
    mlflow.log_metric("rmse", rmse)
This can be done in different ways. One of them is to access the data programmatically with the mlflow.search_runs function,
which returns a Pandas DataFrame containing all useful information for all runs in the current experiment (by default, the current experiment has the name of the current notebook):
mlflow.search_runs().drop(['tags.mlflow.databricks.workspaceURL',
'tags.mlflow.databricks.notebookPath',
'tags.mlflow.source.name',
'tags.mlflow.user'], axis=1)
run_id | experiment_id | status | artifact_uri | start_time | end_time | metrics.rmse | params.maxIter | tags.mlflow.databricks.cluster.libraries.error | tags.mlflow.databricks.notebookRevisionID | tags.mlflow.databricks.workspaceID | tags.mlflow.databricks.notebookID | tags.sparkDatasourceInfo | tags.mlflow.log-model.history | tags.mlflow.databricks.notebook.commandID | tags.mlflow.source.type | tags.mlflow.databricks.webappURL | tags.mlflow.runName | tags.mlflow.databricks.cluster.info | tags.mlflow.databricks.cluster.id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3b461072c96e4ce492250c86e5d4b04a | 4106912166953874 | FINISHED | dbfs:/databricks/mlflow-tracking/4106912166953... | 2023-11-22 16:47:43.155000+00:00 | 2023-11-22 16:48:13.660000+00:00 | 1.685712 | 5 | This message class grpc_shaded.com.databricks.... | 1700671694065 | 3607579860940718 | 4106912166953874 | path=mlflowdbfs:/artifacts?run_id=05c39cdd803d... | [{"artifact_path":"GBT-Regressor","flavors":{"... | 4723686315041778941_5042614833210616149_6f6f9d... | NOTEBOOK | https://eastus-c3.azuredatabricks.net | Tip-run | {"cluster_name":"Victor Bonnet's Cluster","spa... | 1027-081006-5cgi5kuh |
1 | 05c39cdd803d41aaaf1d95e938bccb3c | 4106912166953874 | FINISHED | dbfs:/databricks/mlflow-tracking/4106912166953... | 2023-11-22 16:43:02.360000+00:00 | 2023-11-22 16:43:32.223000+00:00 | 1.071217 | 5 | This message class grpc_shaded.com.databricks.... | 1700671412555 | 3607579860940718 | 4106912166953874 | None | [{"artifact_path":"GBT-Regressor","flavors":{"... | 4723686315041778941_8003943757913343583_2f0694... | NOTEBOOK | https://eastus-c3.azuredatabricks.net | Tip-run | {"cluster_name":"Victor Bonnet's Cluster","spa... | 1027-081006-5cgi5kuh |
2 | 197b10148baf4c55a9cfd55cb716a45f | 4106912166953874 | FINISHED | dbfs:/databricks/mlflow-tracking/4106912166953... | 2023-11-22 16:42:20.492000+00:00 | 2023-11-22 16:42:50.008000+00:00 | 1.568573 | 5 | This message class grpc_shaded.com.databricks.... | 1700671370330 | 3607579860940718 | 4106912166953874 | None | [{"artifact_path":"GBT-Regressor","flavors":{"... | 4723686315041778941_8672488673836230069_a44c4b... | NOTEBOOK | https://eastus-c3.azuredatabricks.net | Tip-run | {"cluster_name":"Victor Bonnet's Cluster","spa... | 1027-081006-5cgi5kuh |
3 | 0f53885013194fdab9bc8ad6383e7bdf | 4106912166953874 | FINISHED | dbfs:/databricks/mlflow-tracking/4106912166953... | 2023-11-22 16:37:23.545000+00:00 | 2023-11-22 16:37:53.810000+00:00 | 1.581776 | 5 | This message class grpc_shaded.com.databricks.... | 1700671074137 | 3607579860940718 | 4106912166953874 | None | [{"artifact_path":"GBT-Regressor","flavors":{"... | 4723686315041778941_6064693358274921191_17ebf6... | NOTEBOOK | https://eastus-c3.azuredatabricks.net | Tip-run | {"cluster_name":"Victor Bonnet's Cluster","spa... | 1027-081006-5cgi5kuh |
4 | 15ba91e309d548c2a85c1f01c006e39b | 4106912166953874 | FINISHED | dbfs:/databricks/mlflow-tracking/4106912166953... | 2023-11-22 16:35:48.939000+00:00 | 2023-11-22 16:36:25.395000+00:00 | 1.498592 | 5 | This message class grpc_shaded.com.databricks.... | 1700670985719 | 3607579860940718 | 4106912166953874 | None | [{"artifact_path":"GBT-Regressor","flavors":{"... | 4723686315041778941_8549608798050021702_4078e9... | NOTEBOOK | https://eastus-c3.azuredatabricks.net | Tip-run | {"cluster_name":"Victor Bonnet's Cluster","spa... | 1027-081006-5cgi5kuh |
Using Pandas syntax, the information can be filtered down to what is needed:
mlflow.search_runs()[["tags.mlflow.runName", "run_id", "params.maxIter", "metrics.rmse"]].sort_values(by=['metrics.rmse'], ascending=True)
tags.mlflow.runName | run_id | params.maxIter | metrics.rmse | |
---|---|---|---|---|
1 | Tip-run | 05c39cdd803d41aaaf1d95e938bccb3c | 5 | 1.071217 |
4 | Tip-run | 15ba91e309d548c2a85c1f01c006e39b | 5 | 1.498592 |
2 | Tip-run | 197b10148baf4c55a9cfd55cb716a45f | 5 | 1.568573 |
3 | Tip-run | 0f53885013194fdab9bc8ad6383e7bdf | 5 | 1.581776 |
0 | Tip-run | 3b461072c96e4ce492250c86e5d4b04a | 5 | 1.685712 |
A SQL-like filter can also be applied directly in the mlflow.search_runs()
function by using its filter_string
parameter. This is particularly useful when there are many runs:
mlflow.search_runs(filter_string="tags.mlflow.runName like '%Tip%' and metrics.rmse<=1.5")[["tags.mlflow.runName", "run_id", "params.maxIter", "metrics.rmse"]]
tags.mlflow.runName | run_id | params.maxIter | metrics.rmse | |
---|---|---|---|---|
0 | Tip-run | 05c39cdd803d41aaaf1d95e938bccb3c | 5 | 1.071217 |
1 | Tip-run | 15ba91e309d548c2a85c1f01c006e39b | 5 | 1.498592 |
With this, let's load the best model:
bestModelRunId = mlflow.search_runs().sort_values(by=['metrics.rmse'], ascending=True).head(1)["run_id"].values[0]
#
best_model_path = f"runs:/{bestModelRunId}/{model_name}"
print(f"Best model path is: {best_model_path}")
#
loaded_model = mlflow.spark.load_model(best_model_path)
Best model path is: runs:/05c39cdd803d41aaaf1d95e938bccb3c/GBT-Regressor
display(loaded_model.transform(test_df).select("tip", "prediction"))
tip | prediction |
---|---|
1.32 | 2.2600362250689874 |
1.56 | 2.2600362250689874 |
1.57 | 2.270591780624543 |
2.0 | 2.4706720284816104 |
1.01 | 2.621464796497559 |
3.0 | 3.2463005089207058 |
2.5 | 3.1992041196791403 |
3.31 | 2.9060591599135535 |
3.12 | 4.0487121332066645 |
7.58 | 4.6646880493827165 |
1.0 | 1.3015395831199872 |
1.8 | 1.9777634977962606 |
2.0 | 1.9777634977962606 |
1.68 | 2.270591780624543 |
2.09 | 2.564973300067474 |
2.47 | 2.6026518714960454 |
4.06 | 3.227487583919192 |
4.0 | 4.361694589718562 |
4.73 | 4.687208975308642 |
2.5 | 6.218021382716049 |
1.5 | 1.5428140028467654 |
1.61 | 1.5533466844466506 |
2.0 | 1.5533466844466506 |
3.55 | 3.456976 |
2.0 | 1.5428140028467654 |
2.2 | 1.9777634977962606 |
2.01 | 1.9777634977962606 |
2.23 | 1.9882961793961458 |
2.0 | 1.9777634977962606 |
1.58 | 2.270591780624543 |
3.48 | 2.270591780624543 |
2.0 | 2.270591780624543 |
2.02 | 2.270591780624543 |
3.0 | 2.564973300067474 |
1.5 | 2.270591780624543 |
2.5 | 3.192036225068987 |
4.19 | 3.131679615987459 |
5.0 | 5.054009020863314 |
5.16 | 4.408080999391555 |
6.5 | 5.169339707135286 |
2.0 | 3.0488064 |
2.0 | 3.427246104519053 |
1.17 | 1.6696819697747305 |
5.0 | 4.990542308641975 |
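As a sanity check on what RegressionEvaluator reports, RMSE can be recomputed by hand from a few (tip, prediction) pairs — it is just sqrt(mean((tip - prediction)^2)). The pairs below are rounded values echoing the table above, so the result is illustrative, not the experiment's actual rmse:

```python
import math

# a few rounded (tip, prediction) pairs echoing the table above
pairs = [(1.32, 2.26), (1.56, 2.26), (7.58, 4.66)]

# rmse = sqrt(mean((y - yhat)^2)) — the metric RegressionEvaluator computes
rmse = math.sqrt(sum((y, p)[0] - p if False else (y - p) ** 2 for y, p in pairs) / len(pairs))
print(round(rmse, 3))  # → 1.817
```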