# 10. Model Tuning

import pickle
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from flaml import AutoML
from flaml.automl.data import get_output_from_log
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Silence RuntimeWarning messages for the remainder of the run.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# Load the training feature frame and order its rows by LOAN_ID.
train_df = pd.read_parquet(
    "../../data/train/pandas-pca-featureframe-maxdepth2-targetSPENT.parquet"
)
train_df = train_df.sort_values(by="LOAN_ID")

# Optional filter, currently disabled: remove retailers who never spent their
# borrowed amount, as these are likely to be first interactions with the
# product or mistakes.
# train_df = train_df.query("label > 0")

# Quick inspection of the loaded frame: schema summary, then the first rows.
# NOTE: the bare `head` expression only renders when it is the last statement
# of a notebook cell; in a plain script its result is discarded.
train_df.info()
train_df.head(n=5)
# Search goal and resource constraints handed to AutoML.fit below.
automl_settings = {
    "task": "regression",
    "metric": "r2",
    # Cross-validated evaluation with 5 time-ordered splits.
    # NOTE(review): presumably relies on the LOAN_ID sort order being
    # chronological -- confirm against the data.
    "eval_method": "cv",
    "n_splits": 5,
    "split_type": "time",
    "early_stop": True,
    "n_jobs": -1,
    "n_concurrent_trials": 1,
    "time_budget": 3600,  # in seconds
    "log_file_name": "tune.log",
    "verbose": 2,
}

# Run the search on every column except the two identifier columns;
# "label" is the regression target.
automl = AutoML()
automl.fit(
    dataframe=train_df.drop(columns=["LOAN_ID", "MAIN_SYSTEM_ID"]),
    label="label",
    **automl_settings,
)
# Save the AutoML object to a file.
# Create the target directory first: without it, open() raises
# FileNotFoundError and the result of the whole tuning run is lost.
model_path = Path("assets/automl.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open("wb") as f:
    pickle.dump(automl, f)
# Load the test data, ordered by LOAN_ID like the training frame.
test_df = pd.read_parquet("../../data/test/pandas-pca-featureframe-maxdepth2-targetSPENT.parquet").sort_values("LOAN_ID")

# Build the feature matrix with the same columns the model was fitted on:
# the target is removed, and so are the identifier columns that were excluded
# from the training frame (tolerated as absent in case the test frame lacks them).
X_test = test_df.drop(columns="label").drop(
    columns=["LOAN_ID", "MAIN_SYSTEM_ID"], errors="ignore"
)
y_test = test_df["label"]

# Perform predictions
y_pred = automl.predict(X_test)

# Score the held-out predictions against the true labels.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE

# Report the selected estimator, then each metric to four decimal places.
print(f"Best Model: {automl.best_estimator}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# Output of the report above (recorded from a previous run):
#   Best Model: rf
#   Mean Absolute Error (MAE): 0.0004
#   Mean Squared Error (MSE): 0.0001
#   Root Mean Squared Error (RMSE): 0.0121
#   R-squared (R2): 0.5624

# time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = (
#     get_output_from_log(filename=automl_settings["log_file_name"], time_budget=automl_settings["time_budget"])
# )

# plt.title("Learning Curve")
# plt.xlabel("Wall Clock Time (s)")
# plt.ylabel("Validation R2")
# plt.step(time_history, 1 - np.array(best_valid_loss_history), where="post")
# plt.show()