Databricks-ML-professional-S03a-Batch
This Notebook adds information related to the following requirements:
Batch:
- Describe batch deployment as the appropriate use case for the vast majority of deployment use cases
- Identify how batch deployment computes predictions and saves them somewhere for later use
- Identify live serving benefits of querying precomputed batch predictions
- Identify less performant data storage as a solution for other use cases
- Load registered models with load_model
- Deploy a single-node model in parallel using spark_udf
- Identify z-ordering as a solution for reducing the amount of time to read predictions from a table
- Identify partitioning on a common column to speed up querying
- Describe the practical benefits of using the score_batch operation
Download this notebook in ipynb format here.
- For use cases where latency is not a stringent requirement (the model has no constraints on how quickly predictions must be generated)
- Data or processing tasks are collected and processed in fixed-size batches
- The data is collected over a period, and the processing occurs on the entire batch
- Batch processing typically has higher latency because it waits for a set amount of data or a specific time interval before processing.
- Can leverage databases and object storage for historical data
- 80-90% of use cases are in batch deployment
- Example: ETL (Extract, Transform, Load) jobs, nightly data processing, and batch analytics are common use cases for batch deployment.
This refers to the process of making predictions in a batch deployment scenario and storing the results for future reference or utilization. Let's break down the key components:
- Batch deployment: In the context of machine learning or data processing, batch deployment refers to a mode of operation where predictions or computations are performed on a set of data collected over a specific period or based on a predefined batch size.
- Computes Predictions: This indicates that the system is generating predictions or results based on the input data. In machine learning, this could involve running a trained model on a batch of input data to produce predictions.
- Save predictions Somewhere: After computing predictions, the results are not immediately discarded. Instead, they are stored or saved in a designated location. This storage could be in databases, files, or any other suitable data storage system.
- Later use: The predictions are saved with the intention of using them at a later time. This could be for various purposes such as analysis, reporting, or serving the predictions to end-users when needed.
In practical terms, the process might involve running a batch job that takes a set of input data, applies a trained model to make predictions, and then stores these predictions in a database, file system, or another storage solution. This approach is common in scenarios where real-time processing is not critical, and predictions can be made on a periodic basis.
For example, in a recommendation system, batch deployment might involve processing user interactions over a day and generating personalized recommendations overnight. The computed recommendations would then be saved for use the next day when users interact with the system.
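The pattern can be sketched end to end in a few lines. This is a minimal illustration, not the notebook's actual pipeline: the toy model, the toy data, and the `/tmp` CSV path are stand-ins for a production model and a database or Delta table.

```python
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Stand-in for an already-trained production model
train = pd.DataFrame({"carat": [0.2, 0.3, 0.5, 1.0],
                      "price": [326, 400, 800, 3000]})
model = RandomForestRegressor(n_estimators=10, random_state=0)
model.fit(train[["carat"]], train["price"])

# Nightly batch job: score the data collected over the period...
batch = pd.DataFrame({"id": [1, 2], "carat": [0.25, 0.9]})
batch["prediction"] = model.predict(batch[["carat"]])

# ...and save the predictions somewhere for later use
batch.to_csv("/tmp/batch_predictions.csv", index=False)
```

The key point is the last line: predictions are persisted, not served, so the consuming system can read them whenever it needs to.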
- Reduced Latency: Precomputing predictions in batch mode allows the system to process and store results ahead of time. This can lead to lower latency during live serving since the predictions are readily available and don't require real-time computation.
- Scalability: Batch processing can be more efficient for large-scale computations. By precomputing predictions in batches, the system can scale more easily to handle varying workloads during live serving.
- Resource efficiency: Computing predictions in batch mode can be resource-efficient, especially for complex models or large datasets. It allows the system to optimize resource utilization during non-peak hours.
- Consistency: Precomputed batch predictions can offer consistency in results, as they are generated using the same model and data. This is in contrast to real-time predictions, which might be influenced by changes in the model or input data at the moment of serving.
- Offline analysis: Having precomputed predictions enables offline analysis of the results, allowing organizations to gain insights, perform audits, and conduct evaluations without affecting live serving.
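The serving-side benefit can be sketched as a simple lookup over precomputed results. The DataFrame, user ids, and recommendation values below are illustrative stand-ins for a real serving store:

```python
import pandas as pd

# Predictions precomputed by the overnight batch job
precomputed = pd.DataFrame({
    "user_id": [101, 102, 103],
    "recommendation": ["ring_a", "ring_b", "ring_c"],
}).set_index("user_id")

def serve(user_id: int) -> str:
    # Request time is a plain lookup: no model inference, hence low latency
    return precomputed.loc[user_id, "recommendation"]

print(serve(102))  # -> ring_b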
In the context of batch deployment for machine learning models, where predictions are generated in bulk, it's common to save these predictions for later use.
For live serving, high-performance databases are often preferred for quick retrieval. However, in certain scenarios like populating emails, where rapid access may not be crucial, less performant data storage options, such as a blob store, can be identified as suitable solutions.
These storage solutions may not offer the highest performance but are chosen strategically based on the specific needs of use cases like email population, balancing considerations of performance and efficiency.
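A sketch of this trade-off, with a local directory standing in for a blob store (the path layout and values are hypothetical):

```python
import json
import os

# One file per day under a blob-store-like prefix; the email job reads it later
root = "/tmp/blob-store/email-recommendations/2023-11-23"
os.makedirs(root, exist_ok=True)

predictions = [{"user_id": 101, "recommendation": "ring_a"}]
path = os.path.join(root, "predictions.json")
with open(path, "w") as f:
    json.dump(predictions, f)

# The email job tolerates slow reads: it runs in the background, so cheap
# storage is preferred over a high-performance database
with open(path) as f:
    print(json.load(f)[0]["recommendation"])  # -> ring_a
```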
Let's see two examples to illustrate this requirement:
- One model trained using scikit-learn library
- One model trained using MLlib library
Then both models will be loaded using the mlflow.pyfunc.load_model function and used the same way for prediction on the test set.
Load some libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#
from pyspark.sql.functions import *
#
import mlflow
import logging
#
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
#
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
#
from databricks import feature_store
logging.getLogger("mlflow").setLevel(logging.FATAL)
Load data into a pandas dataframe (for the sake of simplicity, let's only keep numerical columns):
diamonds_df = sns.load_dataset('diamonds').drop(columns=['cut', 'clarity', 'color'])
diamonds_df.sample(3)
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
26985 | 2.01 | 60.1 | 61.0 | 17068 | 8.14 | 8.06 | 4.87 |
29197 | 0.33 | 59.0 | 61.0 | 694 | 4.49 | 4.56 | 2.67 |
32340 | 0.30 | 62.1 | 56.0 | 789 | 4.29 | 4.31 | 2.67 |
Let's drop duplicates and separate into train set (67%) and test set (33%):
diamonds_sdf = spark.createDataFrame(diamonds_df).dropDuplicates()
#
# Spark Dataframes
test_sdf = diamonds_sdf.orderBy(rand()).limit(int(33*diamonds_sdf.count()/100))
train_sdf = diamonds_sdf.subtract(test_sdf)
#
# Pandas Dataframes
test_df = test_sdf.toPandas()
train_df = train_sdf.toPandas()
#
print(f"Number of rows test set: {test_sdf.count()}")
print(f"Number of rows train set: {train_sdf.count()}")
print(f"Sum of count rows of train and test set: {train_sdf.count() + test_sdf.count()}")
print(f"Total number of rows of initial dataframe: {diamonds_sdf.count()}")
Number of rows test set: 17731
Number of rows train set: 36001
Sum of count rows of train and test set: 53732
Total number of rows of initial dataframe: 53732
Scikit-learn library:
- Train model on train set
- Log model to MLflow
- Register model
- Load model using
mlflow.pyfunc.load_model
- Predict test set using loaded model
# Prepare features and target dataframes
X = train_df.drop('price', axis=1)
y = train_df['price']
#
# train model (is automatically logged to mlflow)
rf = RandomForestRegressor(n_estimators=100, max_depth=5)
rf.fit(X, y)
#
# get latest run_id programmatically
latest_run_id = mlflow.search_runs().sort_values(by="end_time", ascending=False)['run_id'].iloc[0]
#
# uri to latest run (by default, artifact_path is 'model')
uri_scikit_learn = f"runs:/{latest_run_id}/model"
#
# register latest logged model
mlflow.register_model(uri_scikit_learn, name="scikit-learn_model")
#
# load latest registered model
scikit_learn_model = mlflow.pyfunc.load_model(uri_scikit_learn)
#
# prediction of test set using loaded model
pd.DataFrame(scikit_learn_model.predict(test_df.drop('price', axis=1)), columns=['predictions']).head(5)
Registered model 'scikit-learn_model' already exists. Creating a new version of this model...
Created version '7' of model 'scikit-learn_model'.
predictions | |
---|---|
0 | 6771.932454 |
1 | 10611.573560 |
2 | 741.122107 |
3 | 1805.671346 |
4 | 1587.993408 |
MLlib library: There is an additional step, which is to convert the input columns to a vector using VectorAssembler. Thus, we need a pipeline, and we will log the fitted pipeline to MLflow as a model.
- Define vector assembler
- Define Pipeline
- Train pipeline on train set
- Log pipeline as a model to MLflow
- Register model
- Load model using
mlflow.pyfunc.load_model
- Predict test set using loaded model
# set vector assembler parameters
assembler_inputs = [c for c in train_sdf.columns if c not in ['price']]
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
#
# instantiate model
mllib_lr = LinearRegression(featuresCol="features", labelCol='price')
#
# define pipeline stages
stages = [vec_assembler, mllib_lr]
#
# set pipeline
pipeline = Pipeline(stages=stages)
#
# fit pipeline to train set
model_mllib = pipeline.fit(train_sdf)
#
# get latest run_id programmatically
latest_run_id = mlflow.search_runs().sort_values(by="end_time", ascending=False)['run_id'].iloc[0]
#
# uri to latest run (by default, artifact_path is 'model')
uri_mllib = f"runs:/{latest_run_id}/model"
#
# register latest logged model
mlflow.register_model(uri_mllib, name="mllib_model")
#
# load latest registered model
mllib_model = mlflow.pyfunc.load_model(uri_mllib)
#
# Here predictions can be done using same input as for model trained using scikit learn library
pd.DataFrame(mllib_model.predict(test_df.drop('price', axis=1)), columns=['predictions']).head(5)
Registered model 'mllib_model' already exists. Creating a new version of this model...
Created version '4' of model 'mllib_model'.
predictions | |
---|---|
0 | 6711.408932 |
1 | 9540.179628 |
2 | 647.270082 |
3 | 2388.329445 |
4 | 1695.716464 |
# load model into a spark udf
predict_scikit_learn = mlflow.pyfunc.spark_udf(spark, uri_scikit_learn)
#
# make predictions on the spark test dataframe
display(test_sdf.withColumn("prediction", predict_scikit_learn(*[c for c in test_sdf.columns if c not in ['price']])).select("price", "prediction").limit(5))
price | prediction |
---|---|
4580 | 6771.9324540939115 |
8408 | 10611.573559702078 |
1103 | 741.122106786866 |
1332 | 1805.6713457303665 |
1293 | 1587.9934075468402 |
Z-Ordering: colocates related information in the same set of files
Z-Ordering is a form of multi-dimensional clustering that colocates related information in the same set of files. It reduces the amount of data that needs to be read. See more here.
Below is an example of using Z-ordering.
Let's first write a dataframe as a Delta table:
(train_sdf.write
.format("delta")
.mode("overwrite")
.option("overwriteSchema", "true")
.saveAsTable("train_set_diamonds"))
Let's get table location:
display(spark.sql("describe table extended train_set_diamonds").filter("col_name in ('Location')"))
col_name | data_type | comment |
---|---|---|
Location | dbfs:/user/hive/warehouse/train_set_diamonds |
Let's Z-order the table by the feature carat:
delta_partitioned_path = "dbfs:/user/hive/warehouse/train_set_diamonds"
#
spark.sql(f"OPTIMIZE delta.`{delta_partitioned_path}` ZORDER BY (carat)");
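A query filtering on the Z-ordered column can then benefit from data skipping: Delta keeps file-level min/max statistics on carat, so files whose carat range does not overlap the filter are not read. As an illustrative query (run in the same notebook session, on the table saved above):

```python
# Only files whose carat statistics overlap [1.0, 1.5] are scanned
display(spark.table("train_set_diamonds")
             .filter("carat BETWEEN 1.0 AND 1.5")
             .agg({"price": "avg"}))
```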
Partitioning: stores data associated with different categorical values in different directories
Partitioning will create as many folders as there are distinct values in the column chosen as the partition key. Thus, columns with high cardinality are not recommended as partition keys.
Below is an example of using partitioning.
Let's first reload the original dataframe and save it as a managed Delta table:
(spark.createDataFrame(sns.load_dataset('diamonds')).write
.format("delta")
.mode("overwrite")
.option("overwriteSchema", "true")
.saveAsTable("diamonds_df_not_partitioned"))
Let's have a look at the content of the Delta table folder. We see that there are four parquet files and a folder _delta_log:
display(spark.sql("describe table extended diamonds_df_not_partitioned").filter("col_name in ('Location')"))
col_name | data_type | comment |
---|---|---|
Location | dbfs:/user/hive/warehouse/diamonds_df_not_partitioned |
for file in dbutils.fs.ls("dbfs:/user/hive/warehouse/diamonds_df_not_partitioned"):
print(file.path)
dbfs:/user/hive/warehouse/diamonds_df_not_partitioned/_delta_log/
dbfs:/user/hive/warehouse/diamonds_df_not_partitioned/part-00000-ae36b44e-0c7d-4c9f-9a6b-303df7a6b41c-c000.snappy.parquet
dbfs:/user/hive/warehouse/diamonds_df_not_partitioned/part-00001-88c54dec-c999-415e-a6b7-e18f6fcf912c-c000.snappy.parquet
dbfs:/user/hive/warehouse/diamonds_df_not_partitioned/part-00002-05d4b875-93c6-49c3-a176-e25ac6c39cab-c000.snappy.parquet
dbfs:/user/hive/warehouse/diamonds_df_not_partitioned/part-00003-739559b9-0ff4-443f-a6ef-502f1733bae4-c000.snappy.parquet
We can identify the feature cut as a good candidate to be the partition key:
display(spark.table("diamonds_df_not_partitioned").groupBy("cut").count().orderBy(desc('count')))
cut | count |
---|---|
Ideal | 21551 |
Premium | 13791 |
Very Good | 12082 |
Good | 4906 |
Fair | 1610 |
Let's partition using cut as the partition key:
(spark.table("diamonds_df_not_partitioned")
.write.partitionBy("cut")
.format("delta")
.mode("overwrite")
.option("overwriteSchema", "true")
.saveAsTable("diamonds_df_partitioned"))
Now let's have a look at the content of the partitioned table. We see there are as many folders as there are distinct values in the column cut. This will speed up queries that filter on this column.
for file in dbutils.fs.ls("dbfs:/user/hive/warehouse/diamonds_df_partitioned"):
print(file.path)
dbfs:/user/hive/warehouse/diamonds_df_partitioned/_delta_log/
dbfs:/user/hive/warehouse/diamonds_df_partitioned/cut=Fair/
dbfs:/user/hive/warehouse/diamonds_df_partitioned/cut=Good/
dbfs:/user/hive/warehouse/diamonds_df_partitioned/cut=Ideal/
dbfs:/user/hive/warehouse/diamonds_df_partitioned/cut=Premium/
dbfs:/user/hive/warehouse/diamonds_df_partitioned/cut=Very Good/
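Queries filtering on the partition column now benefit from partition pruning: only the matching directories are listed and read. As an illustrative query on the table created above:

```python
# Only files under cut=Premium are scanned
display(spark.table("diamonds_df_partitioned")
             .filter("cut = 'Premium'")
             .groupBy("cut")
             .avg("price"))
```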
score_batch
score_batch lets us make predictions easily on a large amount of data at a time, using features coming from the Feature Store.
Let's have a look at an example to illustrate this requirement:
- Load dataset: The dataset used for the example is the diamonds dataset from the Seaborn library
- Create Feature table in Feature Store
- Push preprocessed features to Feature Store
- Create train and test sets
- Prepare and train models
- Log models to associate them to features in Feature Store
- Score models on test set using score_batch
- Case of new data
1. Load dataset
- Starting from the diamonds dataset from the Seaborn library
- Set an index column made of unique values. It will be used as the primary key for the Feature Store.
- The column x could be confused with the usual feature-matrix name X... let's rename it to x_r.
pd_diamonds = sns.load_dataset('diamonds').reset_index()
#
diamonds_full = spark.createDataFrame(pd_diamonds).withColumnRenamed('x', 'x_r')
#
display(diamonds_full.limit(5))
index | carat | cut | color | clarity | depth | table | price | x_r | y | z |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.2 | 4.23 | 2.63 |
4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
2. Create Feature table in Feature Store
# create a feature store client
fs = feature_store.FeatureStoreClient()
#
# fs.drop_table("default.diamonds_fs")
#
# create feature table - as only the schema is provided in the command below, it only creates the table structure without populating it with data
result = fs.create_table(name="diamonds_fs", # required
primary_keys=["index"], # required
schema=diamonds_full.drop("price").schema, # need either dataframe schema
#df=diamonds_full, # or dataframe itself
description="seaborn diamonds dataset");
2023/11/23 17:32:06 WARNING databricks.feature_store._compute_client._compute_client: Deleting a feature table can lead to unexpected failures in upstream producers and downstream consumers (models, endpoints, and scheduled jobs).
2023/11/23 17:32:09 INFO databricks.feature_store._compute_client._compute_client: Created feature table 'hive_metastore.default.diamonds_fs'.
3. Push preprocessed features to Feature table
(There's no preprocessing done here for the sake of simplicity. Ideally, features pushed to the Feature Store should be processed and ready to be used for model training.)
fs.write_table(name="diamonds_fs",
df=diamonds_full.drop('price'),
mode='merge')
After that, the Feature table is available in the Features menu, as well as the associated Delta table in the Catalog menu.
4. Create train and test sets
Here, features are now available in Features Store. It is possible to load them from there to train a model. For this example, we will:
- Train two different models:
  - one trained using 4 features: x_r, y, z, carat
  - another one trained using all numerical features
- Train models using a part of the dataset
- Evaluate them on another part of the dataset which was not used for training
What will help to make the difference between the train set and the test set is the primary key column: index.
Moreover, later when scoring on the test set, we will need the initial target values from the price column. Thus, the columns needed for the train and test sets are: index and price.
- with index we retrieve from the Feature Store the rows needed to train/test the models
- with price we have the target used to train/evaluate the models
y_test = diamonds_full.select("price", "index").orderBy(rand()).limit(int(33*diamonds_full.count()/100))
y_train = diamonds_full.select("price", "index").subtract(y_test)
#
display(y_train.limit(5))
price | index |
---|---|
337 | 8 |
336 | 5 |
326 | 0 |
334 | 3 |
336 | 6 |
Let's create the Feature Store training sets.
- They will be useful now to load from the Feature Store the features needed to train each model
- They will be useful later when logging the trained models to the Feature Store for easy tracking.
Four features training set:
# With 4 features: x, y, z, carat
feature_lookups_4_features = [feature_store.FeatureLookup(table_name="diamonds_fs",
feature_names=['x_r', 'y', 'z', 'carat'],
lookup_key="index")]
#
# create associated training set
train_set_4_features = fs.create_training_set(y_train,
feature_lookups_4_features,
label="price",
exclude_columns=["index"])
#
# load training set
train_set_4 = train_set_4_features.load_df()
#
# display to check
display(train_set_4.limit(5))
x_r | y | z | carat | price |
---|---|---|---|---|
3.87 | 3.78 | 2.49 | 0.22 | 337 |
3.94 | 3.96 | 2.48 | 0.24 | 336 |
3.95 | 3.98 | 2.43 | 0.23 | 326 |
4.2 | 4.23 | 2.63 | 0.29 | 334 |
3.95 | 3.98 | 2.47 | 0.24 | 336 |
All numerical features training set:
# With all numerical features
feature_lookups_all_features = [feature_store.FeatureLookup(table_name="diamonds_fs",
feature_names=[c for c in diamonds_full.columns if c not in ['index', 'cut', 'clarity', 'price', 'color']],
lookup_key="index")]
#
# create associated training set
train_set_all_features = fs.create_training_set(y_train,
feature_lookups_all_features,
label="price",
exclude_columns=["index"])
#
# load training set
train_set_all = train_set_all_features.load_df()
#
# display to check
display(train_set_all.limit(5))
carat | depth | table | x_r | y | z | price |
---|---|---|---|---|---|---|
0.22 | 65.1 | 61.0 | 3.87 | 3.78 | 2.49 | 337 |
0.24 | 62.8 | 57.0 | 3.94 | 3.96 | 2.48 | 336 |
0.23 | 61.5 | 55.0 | 3.95 | 3.98 | 2.43 | 326 |
0.29 | 62.4 | 58.0 | 4.2 | 4.23 | 2.63 | 334 |
0.24 | 62.3 | 57.0 | 3.95 | 3.98 | 2.47 | 336 |
5. Prepare and train models
Training of two scikit-learn models based on different feature sets. Input dataframes need to be pandas dataframes/series.
Model trained using the four features dataset:
X_train_4 = train_set_4.drop("price").toPandas()
y_train_4 = train_set_4.toPandas()["price"]
#
rf_4_model = RandomForestRegressor()
#
rf_4_model.fit(X_train_4, y_train_4)
Out[36]: RandomForestRegressor()
Model trained using all numerical features dataset:
X_train_all = train_set_all.drop("price").toPandas()
y_train_all = train_set_all.toPandas()["price"]
#
rf_all_model = RandomForestRegressor()
#
rf_all_model.fit(X_train_all, y_train_all)
Out[37]: RandomForestRegressor()
6. Log models to associate them to features in Feature Store
model_name_4_features = "trained_with_4_features"
#
fs.log_model(rf_4_model,
artifact_path=model_name_4_features, # parameter required
flavor=mlflow.sklearn, # parameter required
training_set=train_set_4_features, # either training_set or feature_spec_path parameters required
registered_model_name=model_name_4_features); # not required. However model will not be linked to features in features store until model is registered
Registered model 'trained_with_4_features' already exists. Creating a new version of this model...
Created version '7' of model 'trained_with_4_features'.
model_name_all_features = "trained_with_all_features"
#
fs.log_model(rf_all_model,
artifact_path=model_name_all_features, # parameter required
flavor=mlflow.sklearn, # parameter required
training_set=train_set_all_features, # either training_set or feature_spec_path parameters required
registered_model_name=model_name_all_features); # not required. However model will not be linked to features in features store until model is registered
Registered model 'trained_with_all_features' already exists. Creating a new version of this model...
Created version '6' of model 'trained_with_all_features'.
At this point we can see that the models are associated with the features they were trained on in the Feature Store:
7. Score models on test set using score_batch
Model trained on four features:
# latest run id for model named "trained_with_4_features"
run_id_4 = mlflow.MlflowClient().get_registered_model(model_name_4_features).latest_versions[0].run_id
#
# uri to latest run
uri_4_features = f"runs:/{run_id_4}/{model_name_4_features}"
print(uri_4_features)
#
# predict on test set
predictions_df_4_features = fs.score_batch(uri_4_features, y_test).select("price", "prediction");
display(predictions_df_4_features.orderBy(rand()).limit(5));
runs:/45a6dbcde3a1464a84a61353caf60365/trained_with_4_features
price | prediction |
---|---|
5146 | 6148.454523809524 |
998 | 1054.7966666666666 |
722 | 920.675 |
756 | 911.7239999999999 |
4312 | 4765.28 |
Score RMSE on test set for model trained with 4 features:
print("RMSE for model trained on 4 features:",
mean_squared_error(predictions_df_4_features.toPandas()['price'], predictions_df_4_features.toPandas()['prediction'], squared=False))
RMSE for model trained on 4 features: 1448.8825377496491
Model trained on all numerical features:
# latest run id for model named "trained_with_all_features"
run_id_all = mlflow.MlflowClient().get_registered_model(model_name_all_features).latest_versions[0].run_id
#
# uri to latest run
uri_all_features = f"runs:/{run_id_all}/{model_name_all_features}"
print(uri_all_features)
#
# predict on test set
predictions_df_all_features = fs.score_batch(uri_all_features, y_test).select("price", "prediction", "carat");
display(predictions_df_all_features.select("price", "prediction").orderBy(rand()).limit(5));
runs:/45a6dbcde3a1464a84a61353caf60365/trained_with_all_features
price | prediction |
---|---|
3352 | 3267.8175 |
2281 | 1654.9560000000001 |
3601 | 5770.05 |
9032 | 8038.75 |
6126 | 6958.44 |
Score RMSE on test set for model trained with all numerical features:
print(mean_squared_error(predictions_df_all_features.toPandas()['price'], predictions_df_all_features.toPandas()['prediction'], squared=False))
1401.7197923544627
Comparison of actual price and predicted price according to carat, for the model trained using all numerical features, on a sample of 1000 random entries:
plt.figure(figsize=(17, 6))
#
# convert the sample to pandas once instead of once per plotted series
sample_predictions = predictions_df_all_features.orderBy(rand()).limit(1000).toPandas()
#
plt.scatter(sample_predictions['carat'], sample_predictions['price'], label='Actual Price', color='blue', marker='o')
plt.scatter(sample_predictions['carat'], sample_predictions['prediction'], label='Prediction', color='orange', marker='s')
#
# Adding labels and title
plt.xlabel('Carat')
plt.ylabel('Prices')
plt.title('Actual Price vs Prediction')
#
# Adding grid for better readability
plt.grid(True, linestyle='--', alpha=0.7)
#
# Adding legend
plt.legend()
#
# Show plot
plt.show();
8. Case of new data
How can we predict newly arriving data if there is no information on it in the Feature Store? We first need to update the Feature Store with the new data.
Let's create a new diamond record. Its schema should match the data in the Feature Store:
diamonds_full.printSchema()
root
 |-- index: long (nullable = true)
 |-- carat: double (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: double (nullable = true)
 |-- table: double (nullable = true)
 |-- price: long (nullable = true)
 |-- x_r: double (nullable = true)
 |-- y: double (nullable = true)
 |-- z: double (nullable = true)
new_diamond = (diamonds_full.limit(1).withColumn('index', lit(88887777).cast('long'))
.withColumn('carat', lit(2).cast('double'))
.withColumn('cut', lit('Good').cast('string'))
.withColumn('color', lit('E').cast('string'))
.withColumn('clarity', lit('VS1').cast('string'))
.withColumn('depth', lit(40).cast('double'))
.withColumn('table', lit(64).cast('double'))
.withColumn('x_r', lit(4.14).cast('double'))
.withColumn('y', lit(3.5).cast('double'))
.withColumn('z', lit(2.1).cast('double')))
#
new_diamond_with_price = spark.createDataFrame(pd.DataFrame({'index': [88887777], 'price': [4500]}))
#
new_diamond_without_price = spark.createDataFrame(pd.DataFrame({'index': [88887777]}))
#
diamond_unknown = spark.createDataFrame(pd.DataFrame({'index': [98989898]}))
#
display(new_diamond)
display(new_diamond_with_price)
display(new_diamond_without_price)
index | carat | cut | color | clarity | depth | table | price | x_r | y | z |
---|---|---|---|---|---|---|---|---|---|---|
88887777 | 2.0 | Good | E | VS1 | 40.0 | 64.0 | 326 | 4.14 | 3.5 | 2.1 |
index | price |
---|---|
88887777 | 4500 |
index |
---|
88887777 |
Now, update the Feature Store with the new diamond data:
fs.write_table(name="diamonds_fs",
df=new_diamond,
mode='merge')
We verify that score_batch predicts whether or not the price of the new data is provided; the only requirement is that the primary key of the new diamond data - in this particular case, the column index - exists in the Feature Store:
# predict with price
predictions_new_diamond_with_price = fs.score_batch(uri_all_features, new_diamond_with_price)
display(predictions_new_diamond_with_price)
#
# predict without price
predictions_new_diamond_without_price = fs.score_batch(uri_all_features, new_diamond_without_price)
display(predictions_new_diamond_without_price)
index | price | carat | depth | table | x_r | y | z | prediction |
---|---|---|---|---|---|---|---|---|
88887777 | 4500 | 2.0 | 40.0 | 64.0 | 4.14 | 3.5 | 2.1 | 9639.403333333334 |
index | carat | depth | table | x_r | y | z | prediction |
---|---|---|---|---|---|---|---|
88887777 | 2.0 | 40.0 | 64.0 | 4.14 | 3.5 | 2.1 | 9639.403333333334 |
And verify that if a primary key is not found in the Feature Store, it results in an error:
# predict unknown diamond
predictions_unknown_diamond = fs.score_batch(uri_all_features, diamond_unknown)
display(predictions_unknown_diamond)