11. Results#

11.1. Load data#

Hide code cell source
ecommerce_df = (
    pd.read_csv(
        "../../data/Ecommerce_orders_Data.csv",
        header=0,
        dtype={
            "ORDER_ID": np.dtype("int64"),
            "MAIN_SYSTEM_ID": np.dtype("int64"),
            "ORDER_PRICE": np.dtype("float64"),
            "DISCOUNT": np.dtype("float64"),
            "ORDER_PRICE_AFTER_DISCOUNT": np.dtype("float64"),
            "ORDER_CREATION_DATE": np.dtype("O"),
        },
    )
    .assign(
        ORDER_CREATION_DATE=lambda x: pd.to_datetime(
            x["ORDER_CREATION_DATE"], infer_datetime_format=True
        ),
    )
    .drop(["DISCOUNT", "ORDER_PRICE_AFTER_DISCOUNT"], axis=1)
    .sort_values("ORDER_CREATION_DATE")
)

ecommerce_df = (
    ecommerce_df.assign(
        ROLLINGMEAN_30DAYS__ORDER_PRICE__=(
            lambda x: x.groupby("MAIN_SYSTEM_ID")
            .rolling("30D", on="ORDER_CREATION_DATE")["ORDER_PRICE"]
            .mean()
            .reset_index(drop=True)
        ),
        ROLLINGSTDDEV_30DAYS__ORDER_PRICE__=(
            lambda x: x.groupby("MAIN_SYSTEM_ID")
            .rolling("30D", on="ORDER_CREATION_DATE")["ORDER_PRICE"]
            .std()
            .reset_index(drop=True)
        ),
    )[
        [
            "MAIN_SYSTEM_ID",
            "ORDER_CREATION_DATE",
            "ORDER_PRICE",
            "ROLLINGMEAN_30DAYS__ORDER_PRICE__",
            "ROLLINGSTDDEV_30DAYS__ORDER_PRICE__",
        ]
    ]
    .sort_values("ORDER_CREATION_DATE")
    .reset_index(drop=True)
)
ecommerce_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656473 entries, 0 to 656472
Data columns (total 5 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   MAIN_SYSTEM_ID                       656473 non-null  int64         
 1   ORDER_CREATION_DATE                  656473 non-null  datetime64[ns]
 2   ORDER_PRICE                          656473 non-null  float64       
 3   ROLLINGMEAN_30DAYS__ORDER_PRICE__    656473 non-null  float64       
 4   ROLLINGSTDDEV_30DAYS__ORDER_PRICE__  632899 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 25.0 MB
Hide code cell source
metadata_cols = ["LOAN_ID", "LOAN_ISSUANCE_DATE", "LOAN_AMOUNT"]
loans_df = (
    pd.read_excel("../../data/Loans_Data.xlsx")
    .drop(
        [
            # Low signal columns
            "INITIAL_COST",
            "INDEX",
            "REPAYMENT_ID",
            "FINAL_COST",
            "RETAILER_ID",
            # Columns populated after the fact, thus would lead to data leak
            "REPAYMENT_UPDATED",
            "SPENT",
            "TOTAL_FINAL_AMOUNT",
            "FIRST_TRIAL_BALANCE",
            "FIRST_TRAIL_DELAYS",
            "PAYMENT_AMOUNT",
            "LOAN_PAYMENT_DATE",
            "REPAYMENT_AMOUNT",
            "CUMMULATIVE_OUTSTANDING",
            "PAYMENT_STATUS",
        ],
        axis=1,
    )
    .assign(
        MAIN_SYSTEM_ID=lambda x: x["MAIN_SYSTEM_ID"].astype("int"),
        LOAN_ID=lambda x: x["LOAN_ID"].astype("int64"),
        LOAN_ISSUANCE_DATE=lambda x: x["LOAN_ISSUANCE_DATE"].astype("<M8[ns]"),
        LOAN_AMOUNT=lambda x: x["LOAN_AMOUNT"].astype("float64"),
        TOTAL_INITIAL_AMOUNT=lambda x: x["TOTAL_INITIAL_AMOUNT"].astype("float64"),
        INITIAL_DATE=lambda x: x["INITIAL_DATE"].astype("<M8[ns]"),
    )
)

test_df = pd.merge(
    loans_df[metadata_cols],
    pd.read_parquet("../../data/test/pandas-pca-featureframe-maxdepth2-targetSPENT.parquet").assign(MAIN_SYSTEM_ID=lambda x: x["MAIN_SYSTEM_ID"].astype("int")),
    on="LOAN_ID"
).drop_duplicates()

test_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 56296 entries, 0 to 56295
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   LOAN_ID                56296 non-null  int64         
 1   LOAN_ISSUANCE_DATE     56296 non-null  datetime64[ns]
 2   LOAN_AMOUNT            56296 non-null  float64       
 3   MAIN_SYSTEM_ID         56296 non-null  int64         
 4   label                  56296 non-null  float64       
 5   principal_component_0  56296 non-null  float64       
 6   principal_component_1  56296 non-null  float64       
 7   principal_component_2  56296 non-null  float64       
 8   principal_component_3  56296 non-null  float64       
 9   principal_component_4  56296 non-null  float64       
 10  principal_component_5  56296 non-null  float64       
 11  principal_component_6  56296 non-null  float64       
 12  principal_component_7  56296 non-null  float64       
 13  principal_component_8  56296 non-null  float64       
 14  principal_component_9  56296 non-null  float64       
dtypes: datetime64[ns](1), float64(12), int64(2)
memory usage: 6.9 MB

11.2. Load model and get predictions#

Hide code cell source
with open("assets/sagemaker-flaml-automl-regression-maxdepth2-targetSPENT-3.pkl", "rb") as fp:
    automl: AutoML = pickle.load(fp)

test_df = test_df.assign(
    y_pred=automl.predict(test_df.drop(metadata_cols + ["MAIN_SYSTEM_ID", "label"], axis=1))
).sort_values("LOAN_ISSUANCE_DATE")

11.3. Join fields to generate final dataframe and report#

Hide code cell source
merged_df = pd.merge_asof(
    test_df,
    ecommerce_df,
    left_on="LOAN_ISSUANCE_DATE",
    right_on="ORDER_CREATION_DATE",
    by="MAIN_SYSTEM_ID",
    direction="backward",
    tolerance=pd.Timedelta("7d"),
).query("LOAN_ISSUANCE_DATE > ORDER_CREATION_DATE")

merged_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34418 entries, 0 to 56293
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   LOAN_ID                              34418 non-null  int64         
 1   LOAN_ISSUANCE_DATE                   34418 non-null  datetime64[ns]
 2   LOAN_AMOUNT                          34418 non-null  float64       
 3   MAIN_SYSTEM_ID                       34418 non-null  int64         
 4   label                                34418 non-null  float64       
 5   principal_component_0                34418 non-null  float64       
 6   principal_component_1                34418 non-null  float64       
 7   principal_component_2                34418 non-null  float64       
 8   principal_component_3                34418 non-null  float64       
 9   principal_component_4                34418 non-null  float64       
 10  principal_component_5                34418 non-null  float64       
 11  principal_component_6                34418 non-null  float64       
 12  principal_component_7                34418 non-null  float64       
 13  principal_component_8                34418 non-null  float64       
 14  principal_component_9                34418 non-null  float64       
 15  y_pred                               34418 non-null  float64       
 16  ORDER_CREATION_DATE                  34418 non-null  datetime64[ns]
 17  ORDER_PRICE                          34418 non-null  float64       
 18  ROLLINGMEAN_30DAYS__ORDER_PRICE__    34418 non-null  float64       
 19  ROLLINGSTDDEV_30DAYS__ORDER_PRICE__  33152 non-null  float64       
dtypes: datetime64[ns](2), float64(16), int64(2)
memory usage: 5.5 MB

11.4. Make decision to approve or deny loans#

CONSERVATIVE_DECISION is when:

  • model predicts SPENT to be within 1 stddev of Ecommerce 30-days mean

  • LOAN_AMOUNT does not exceed 1 stddev of Ecommerce 30-days mean

AMBITIOUS_DECISION is when:

  • model predicts SPENT to be within 2 stddev of Ecommerce 30-days mean

  • LOAN_AMOUNT does not exceed 2 stddev of Ecommerce 30-days mean

(
    merged_df.assign(
        CONSERVATIVE_DECISION=(
            (merged_df["y_pred"] <= merged_df["ROLLINGMEAN_30DAYS__ORDER_PRICE__"] + merged_df["ROLLINGSTDDEV_30DAYS__ORDER_PRICE__"])
            & (merged_df["LOAN_AMOUNT"] <= merged_df["ROLLINGMEAN_30DAYS__ORDER_PRICE__"] + merged_df["ROLLINGSTDDEV_30DAYS__ORDER_PRICE__"])
        ),
        AMBITIOUS_DECISION=(
            (merged_df["y_pred"] <= merged_df["ROLLINGMEAN_30DAYS__ORDER_PRICE__"] + 2 * merged_df["ROLLINGSTDDEV_30DAYS__ORDER_PRICE__"])
            & (merged_df["LOAN_AMOUNT"] <= merged_df["ROLLINGMEAN_30DAYS__ORDER_PRICE__"] + 2 * merged_df["ROLLINGSTDDEV_30DAYS__ORDER_PRICE__"])
        ),
    )
    [[
        "LOAN_ID",
        "CONSERVATIVE_DECISION",
        "AMBITIOUS_DECISION"
    ]]
    .to_csv("../../data/risk-assessment-decisions.csv", header=True, index=False)
)

11.5. Result Assessment#

Result assessment is based only on 34,418 observations out of the 73,086 provided, roughly the 50% latest observations – see Splitting ML dataset for more info.

Is that a problem?

There is no such thing as perfect method, but there objectively better ones. We are forced to choose where to place uncertainty:

  • giving models all the data, increasing chance of overfitting (models memorize the data), thus weakening confidence in results

  • training models in “past” data and assessing them on “future” data, reducing the signal available for models to learn, but increasing a lot confidence on results

The latter is objectively better. Think of it like this: is it better to have a funny friend that lies to you all the time, or the awkward one that is always there for you? For a party (i.e. boasting about astonishingly unrealistic performance), you want the funny friend, but what about for life?

Another point of attention is that I will consider PAYMENT_STATUS in ('Unpaid', 'Partialy paid') as defaults. I will also consider defaults all cases when the retailer is not able to make the payment on the first collection attempt. Let me explain. I understand retailers are not always to blame for missed collection attempt, it sometimes falls on our operations agents. This can definitely be addressed in the future with a separate track: optimizing ops agents schedules to maximize collection rates. However, for this case study, I’m going to limit the scope of our assessment, and my judgement is that there is also a component of timing in a retailer’s ability to repay. This shifts the objective from being “we eventually want to collect debts” to “our forecast should also optimize for timing”. Of course, this goes beyond the scope of a case study, but it is an interesting domain to explore.

With all that in mind, let’s first take a look at some broad metrics:

Hide code cell source
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import expr
import pyspark.sql.types as spark_dtype
import pandas as pd


# Start Spark session just because I feel more comfortable with its API rather than Pandas
spark = SparkSession.builder.getOrCreate()

# Load Decisions and Loans datasets
decision_test_observations: DataFrame = (
    spark.read.option("header", "true")
    .schema(
        spark_dtype.StructType(
            [
                spark_dtype.StructField("LOAN_ID", spark_dtype.LongType(), nullable=False),
                spark_dtype.StructField("CONSERVATIVE_DECISION", spark_dtype.BooleanType(), nullable=False),
                spark_dtype.StructField("AMBITIOUS_DECISION", spark_dtype.BooleanType(), nullable=False),
            ]
        )
    )
    .csv("../../data/risk-assessment-decisions.csv")
)
loan_metadata: DataFrame = (
    spark.createDataFrame(
        pd.read_excel("../../data/Loans_Data.xlsx")
        .drop(
            [
                # Irrelevant columns for the current assessment
                "TOTAL_INITIAL_AMOUNT", # can be inferred from other columns
                "TOTAL_FINAL_AMOUNT", # can be inferred from other columns
                "INDEX",
                "REPAYMENT_ID",
                "RETAILER_ID",
                "REPAYMENT_UPDATED",
                "PAYMENT_AMOUNT",
                "LOAN_PAYMENT_DATE",
                "REPAYMENT_AMOUNT",
                "CUMMULATIVE_OUTSTANDING",

            ],
            axis=1,
        )
        .assign(
            MAIN_SYSTEM_ID=lambda x: x["MAIN_SYSTEM_ID"].astype("int64"),
            LOAN_ID=lambda x: x["LOAN_ID"].astype("int64"),
            LOAN_ISSUANCE_DATE=lambda x: x["LOAN_ISSUANCE_DATE"].astype("<M8[ns]"),
            LOAN_DUE_DATE=lambda x: x["INITIAL_DATE"].astype("<M8[ns]"),
            # Financial info
            LOAN_AMOUNT=lambda x: x["LOAN_AMOUNT"].astype("float64"),
            INITIAL_COST=lambda x: x["INITIAL_COST"].astype("float64"),
            SPENT=lambda x: x["SPENT"].astype("float64"),
            FIRST_TRIAL_BALANCE=lambda x: x["FIRST_TRIAL_BALANCE"].astype("float64"),
            FINAL_COST=lambda x: x["FINAL_COST"].astype("float64"),
        )
    )
    .where("LOAN_ISSUANCE_DATE > '2021-12-31'")
    .join(decision_test_observations.select("LOAN_ID"), on="LOAN_ID", how="inner")
    .withColumn("LABEL", expr("(lower(PAYMENT_STATUS) != 'paid') OR (FIRST_TRIAL_BALANCE < 0)"))
    #.persist()
)

loan_metadata.selectExpr(
    # Overview
    "count(distinct LOAN_ID) as nunique_loans",
    "count(distinct date_trunc('week', LOAN_ISSUANCE_DATE)) as nunique_weeks",
    "format_string('%.2f', sum(LOAN_AMOUNT)) as total_amount_requested",
    "format_string('%.2f', sum(SPENT)) as total_amount_spent",
    "format_string('%.4f', 1 - sum(SPENT) / sum(LOAN_AMOUNT)) as percentage_amount_stale",
    # "X as total_realized_loss_first_collection",
    # Counts
    "count(distinct LOAN_ID) filter (where LABEL = false) as nunique_loans_fullfiled_first_collection",
    "count(distinct LOAN_ID) filter (where LABEL = true) as nunique_loans_defaulted_first_collection",
).show(vertical=True)

From the above summary table, we can see that we observe 371 first-collection default observations, meaning loans that were not fully repaid in first collection attempt.

In the following plot we can see the volume of LOAN_AMOUNT (green) and SPENT (blue) over 10 weeks of testing period. The red line (right y-axis) shows the volume of failed first collection attempts that ocurred in reality. One other piece of information that becomes clear is our rate of stale capital in the difference between the green and blue bars: that gap represents money that is committed but not used, thus no fees, hence stale capital.

I also plot the count of loans over the same period on the second plot; the large drop-off in the last week of October is explained by incomplete data and can be ignored.

Hide code cell source
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter


wow_overview: DataFrame = (
    loan_metadata.selectExpr(
        "*",
        "date_trunc('week', LOAN_ISSUANCE_DATE) as week_dt"
    )
    .groupBy("week_dt")
    .agg(
        expr("count(distinct LOAN_ID) as nunique_loans"),
        expr("sum(LOAN_AMOUNT) as total_amount_requested"),
        expr("sum(SPENT) as total_amount_spent"),
        expr("abs(sum(FIRST_TRIAL_BALANCE) filter (where FIRST_TRIAL_BALANCE < 0)) as total_amount_loss"),
    )
    .selectExpr(
        "*",
        "nunique_loans / (lag(nunique_loans) over (order by week_dt asc))as nunique_loans_weekly_growth",
        "total_amount_requested / (lag(total_amount_requested) over (order by week_dt asc)) as total_amount_requested_weekly_growth",
        "total_amount_spent / (lag(total_amount_spent) over (order by week_dt asc)) as total_amount_spent_weekly_growth",
        "total_amount_loss / (lag(total_amount_loss) over (order by week_dt asc)) as total_amount_loss_weekly_growth",
    )
    .dropna()
    .orderBy("week_dt")
)

wow_overview.where("week_dt < '2022-10-31'").selectExpr(
    "avg(nunique_loans_weekly_growth)",
    "avg(total_amount_requested_weekly_growth)",
    "avg(total_amount_spent_weekly_growth)"
).show(1, vertical=True)

plot_data = wow_overview.toPandas().set_index("week_dt")
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))
ax1_2 = ax1.twinx()
ax1.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax1_2.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))

ax1_2.plot(
    plot_data.index,
    plot_data["total_amount_loss"],
    color="orangered",
    label="total_amount_loss",
)
ax1.bar(
    plot_data.index,
    plot_data["total_amount_requested"],
    color="limegreen",
    label="total_amount_requested",
    width=5
)
ax1.bar(
    plot_data.index,
    plot_data["total_amount_spent"],
    color="steelblue",
    label="total_amount_spent",
    width=5
)

ax1_2.set_ylabel("Financial Loss Ground Truth",color="tab:red")
ax1.set_title("Volume of Loans")
ax1.set_xticks(plot_data.index)
ax1.set_xticklabels(plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45) 

bar3 = plot_data.plot.bar(
    ax=ax2,
    y=["nunique_loans"],
    color=["slategrey"],
)
ax2.set_xlabel('Week')
ax2.set_title("Number of Loans")
# ax2.set_xticks(plot_data.index)
ax2.set_xticklabels(plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax2.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))


ax1.legend()
plt.show()
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
-RECORD 0-------------------------------------------------------
 avg(nunique_loans_weekly_growth)          | 1.114742022449728  
 avg(total_amount_requested_weekly_growth) | 1.1265760210218776 
 avg(total_amount_spent_weekly_growth)     | 1.1250631960791917 

23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
../_images/00348d2eb74f182f35a123d30beb3f9b343fcfafb13202a34d00ff26706cefa1.png

Now that we understand the Ground Truth (GT), I can benchmark my 2 Credit Rules: conservative and ambitious.

11.5.1. Conservative#

Conservative means that we are willing to approve a loan if:

\[\begin{split} (loan\_amount <= ecommerce\_average\_volume_{30days} + 1 * standard\_deviation_{30days}) \\ AND \\ (estimated\_spent <= ecommerce\_average\_volume_{30days} + 1 * standard\_deviation_{30days}) \end{split}\]

Conservative means we would have:

  • approved X (currency) out of Y requested in total

  • served X out of Y retailers that requested loans

  • accepted exposure wow:

  • realized loses wow:

  • opportunity gap wow:

Hide code cell source
import numpy as np

wow_conservative: DataFrame = (
    loan_metadata.select(
        "LOAN_ID",
        "LOAN_AMOUNT",
        "FIRST_TRIAL_BALANCE",
        "LOAN_ISSUANCE_DATE",
    )
    .join(decision_test_observations, on="LOAN_ID")
    .withColumn("week_dt", expr("date_trunc('week', LOAN_ISSUANCE_DATE)"))
    .groupBy("week_dt")
    .agg(
        expr("count(distinct LOAN_ID) as nunique_loans"),
        expr("count(distinct LOAN_ID) filter (where AMBITIOUS_DECISION = true) as nunique_loans_approved"),
        expr("count(distinct LOAN_ID) filter (where CONSERVATIVE_DECISION = true and FIRST_TRIAL_BALANCE < 0) as nunique_loans_approved_defaulted"),
        expr("count(distinct LOAN_ID) filter (where FIRST_TRIAL_BALANCE < 0) as nunique_loans_defaulted_gt"),
        # Volume
        expr("sum(LOAN_AMOUNT) as total_amount_requested"),
        expr("sum(LOAN_AMOUNT) filter (where CONSERVATIVE_DECISION = true) as amount_approved"),
        expr("sum(LOAN_AMOUNT) filter (where CONSERVATIVE_DECISION = true and FIRST_TRIAL_BALANCE < 0) as amount_approved_defaulted"),
        expr("sum(LOAN_AMOUNT) filter (where FIRST_TRIAL_BALANCE < 0) as total_amount_requested_defaulted_gt"),
        # expr("sum(SPENT) as total_amount_spent"),
        # expr("abs(sum(FIRST_TRIAL_BALANCE) filter (where FIRST_TRIAL_BALANCE < 0)) as total_amount_loss"),
    )
    .selectExpr(
        "*",
        "nunique_loans_approved / (lag(nunique_loans_approved) over (order by week_dt asc))as nunique_loans_approved_weekly_growth",
        "nunique_loans_approved_defaulted / (lag(nunique_loans_approved_defaulted) over (order by week_dt asc))as nunique_loans_approved_defaulted_weekly_growth",
        # "total_amount_requested / (lag(total_amount_requested) over (order by week_dt asc)) as total_amount_requested_weekly_growth",
        # "total_amount_spent / (lag(total_amount_spent) over (order by week_dt asc)) as total_amount_spent_weekly_growth",
        # "total_amount_loss / (lag(total_amount_loss) over (order by week_dt asc)) as total_amount_loss_weekly_growth",
    )
    .dropna()
    .orderBy("week_dt")
)

wow_conservative.where("week_dt < '2022-10-31'").selectExpr(
    "avg(nunique_loans_approved_weekly_growth)",
    "avg(nunique_loans_approved_defaulted_weekly_growth)",
).show(1, vertical=True)

conservative_plot_data = wow_conservative.toPandas().set_index("week_dt")
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
bar_width = .4
x_values = np.arange(len(conservative_plot_data.index))

ax1.bar(
    # conservative_plot_data.index,
    x_values + bar_width / 2,
    conservative_plot_data["nunique_loans_defaulted_gt"].values,
    color="red",
    label="nunique_loans_defaulted_gt",
    width=bar_width
)
ax1.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["nunique_loans"].values,
    color="steelblue",
    label="nunique_loans",
    width=bar_width
)
ax1.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["nunique_loans_approved"].values,
    color="limegreen",
    label="nunique_loans_approved",
    width=bar_width
)
ax1.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["nunique_loans_approved_defaulted"].values,
    color="lightcoral",
    label="nunique_loans_approved_defaulted",
    width=bar_width
)

ax1.set_title("Number of Loans")
ax1.set_xticks(x_values)
ax1.set_xticklabels(conservative_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax1.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax1.legend()

ax2.bar(
    # conservative_plot_data.index,
    x_values + bar_width / 2,
    conservative_plot_data["nunique_loans_defaulted_gt"].values,
    color="red",
    label="nunique_loans_defaulted_gt",
    width=bar_width
)
ax2.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["nunique_loans_approved_defaulted"].values,
    color="lightcoral",
    label="nunique_loans_approved_defaulted",
    width=bar_width
)

ax2.set_title("Requests closer inspection")
ax2.set_xticks(x_values)
ax2.set_xticklabels(conservative_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax2.legend()

ax3.bar(
    # conservative_plot_data.index,
    x_values + bar_width / 2,
    conservative_plot_data["total_amount_requested_defaulted_gt"].values,
    color="red",
    label="total_amount_requested_defaulted_gt",
    width=bar_width
)
ax3.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["total_amount_requested"].values,
    color="steelblue",
    label="total_amount_requested",
    width=bar_width
)
ax3.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["amount_approved"].values,
    color="limegreen",
    label="amount_approved",
    width=bar_width
)
ax3.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["amount_approved_defaulted"].values,
    color="lightcoral",
    label="amount_approved_defaulted",
    width=bar_width
)

ax3.set_title("Volume of Loans")
ax3.set_xticks(x_values)
ax3.set_xticklabels(conservative_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax3.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax3.legend()

ax4.bar(
    # conservative_plot_data.index,
    x_values + bar_width / 2,
    conservative_plot_data["total_amount_requested_defaulted_gt"].values,
    color="red",
    label="total_amount_requested_defaulted_gt",
    width=bar_width
)
ax4.bar(
    # conservative_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["amount_approved_defaulted"].values,
    color="lightcoral",
    label="amount_approved_defaulted",
    width=bar_width
)

ax4.set_title("Volume closer inspection")
ax4.set_xticks(x_values)
ax4.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax4.set_xticklabels(conservative_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax4.legend()

plt.tight_layout()
plt.show()
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
-RECORD 0-----------------------------------------------------------------
 avg(nunique_loans_approved_weekly_growth)           | 1.1077757861859945 
 avg(nunique_loans_approved_defaulted_weekly_growth) | 1.0470072892397944 
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
../_images/ba0363851ef66f785581f90968f9fa67a51fe74d27fc5deae3321b1c9d105866.png

[WIP] Ideas, definitely no time to cover all – choose a couple:

  • daily rate of exposure (need to define exposure precisely, maybe stddev above 30-day mean predicted volume)

  • weekly rate of exposure vs. realized loss

  • recommend daily interest rates based to cover previous week exposure (if time allows, account for seasonality)

  • daily rate of stale capital: LOAN_AMOUNT vs. SPENT

  • weekly projected growth vs. realized growth

11.5.2. Ambitious#

Ambitious means that we are willing to approve a loan if the same formula above applies but with \(2 * standard\_deviation_{30days}\), higher multiplier for standard deviation upper bound.

By choosing to be ambitious, we are willing to provide loans to 13.6% more retailers, knowing these to be leverage-seekers. As mentioned before, hedging against financial loss via product pricing is a very important topic, but goes beyond the scope of this case study. However, just as food-for-thought, here are some parameters that have to be considered and others that can be adjusted:

  • the current available lending pool volume

  • the current exposure and coverage on volume already approved for lending in the period (e.g. daily)

    • if current volume_lent is made up mostly of conservative loans, we can be more ambitious until we reach our financial exposure threshold, or are no longer interested in adding overhead to pricing for non-financial reasons (e.g. bad press on high rates – “abusive rates” is clickbait headline that unfortunately sells)

  • our assumed percent rate of default_volume: nob adjusted based on standard_deviation multiplier

  • our margin_multiplier: nob adjusted based on growth vs. profit appetite

These are just a few parameters that we could observe and manipulate to keep operations healthy while keeping financial exposure under control. There is so much more to explore, this is indeed very exciting.

Finally, ambitious means we would have:

  • approved X (currency) out of Y requested in total

  • served X out of Y retailers that requested loans

  • accepted exposure wow:

  • realized loses wow:

  • opportunity gap wow:

Hide code cell source
import numpy as np

wow_ambitious: DataFrame = (
    loan_metadata.select(
        "LOAN_ID",
        "LOAN_AMOUNT",
        "FIRST_TRIAL_BALANCE",
        "LOAN_ISSUANCE_DATE",
    )
    .join(decision_test_observations, on="LOAN_ID")
    .withColumn("week_dt", expr("date_trunc('week', LOAN_ISSUANCE_DATE)"))
    .groupBy("week_dt")
    .agg(
        expr("count(distinct LOAN_ID) as nunique_loans"),
        expr("count(distinct LOAN_ID) filter (where AMBITIOUS_DECISION = true) as nunique_loans_approved"),
        expr("count(distinct LOAN_ID) filter (where AMBITIOUS_DECISION = true and FIRST_TRIAL_BALANCE < 0) as nunique_loans_approved_defaulted"),
        expr("count(distinct LOAN_ID) filter (where FIRST_TRIAL_BALANCE < 0) as nunique_loans_defaulted_gt"),
        # Volume
        expr("sum(LOAN_AMOUNT) as total_amount_requested"),
        expr("sum(LOAN_AMOUNT) filter (where AMBITIOUS_DECISION = true) as amount_approved"),
        expr("sum(LOAN_AMOUNT) filter (where AMBITIOUS_DECISION = true and FIRST_TRIAL_BALANCE < 0) as amount_approved_defaulted"),
        expr("sum(LOAN_AMOUNT) filter (where FIRST_TRIAL_BALANCE < 0) as total_amount_requested_defaulted_gt"),
        # expr("sum(SPENT) as total_amount_spent"),
        # expr("abs(sum(FIRST_TRIAL_BALANCE) filter (where FIRST_TRIAL_BALANCE < 0)) as total_amount_loss"),
    )
    .selectExpr(
        "*",
        "nunique_loans_approved / (lag(nunique_loans_approved) over (order by week_dt asc))as nunique_loans_approved_weekly_growth",
        "nunique_loans_approved_defaulted / (lag(nunique_loans_approved_defaulted) over (order by week_dt asc))as nunique_loans_approved_defaulted_weekly_growth",
        # "total_amount_requested / (lag(total_amount_requested) over (order by week_dt asc)) as total_amount_requested_weekly_growth",
        # "total_amount_spent / (lag(total_amount_spent) over (order by week_dt asc)) as total_amount_spent_weekly_growth",
        # "total_amount_loss / (lag(total_amount_loss) over (order by week_dt asc)) as total_amount_loss_weekly_growth",
    )
    .dropna()
    .orderBy("week_dt")
)

wow_ambitious.where("week_dt < '2022-10-31'").selectExpr(
    "avg(nunique_loans_approved_weekly_growth)",
    "avg(nunique_loans_approved_defaulted_weekly_growth)",
).show(1, vertical=True)

ambitious_plot_data = wow_ambitious.toPandas().set_index("week_dt")
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
bar_width = .4
x_values = np.arange(len(ambitious_plot_data.index))

ax1.bar(
    # ambitious_plot_data.index,
    x_values + bar_width / 2,
    ambitious_plot_data["nunique_loans_defaulted_gt"].values,
    color="red",
    label="nunique_loans_defaulted_gt",
    width=bar_width
)
ax1.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["nunique_loans"].values,
    color="steelblue",
    label="nunique_loans",
    width=bar_width
)
ax1.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["nunique_loans_approved"].values,
    color="limegreen",
    label="nunique_loans_approved",
    width=bar_width
)
ax1.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["nunique_loans_approved_defaulted"].values,
    color="lightcoral",
    label="nunique_loans_approved_defaulted",
    width=bar_width
)

ax1.set_title("Number of Loans")
ax1.set_xticks(x_values)
ax1.set_xticklabels(ambitious_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax1.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax1.legend()

ax2.bar(
    # ambitious_plot_data.index,
    x_values + bar_width / 2,
    ambitious_plot_data["nunique_loans_defaulted_gt"].values,
    color="red",
    label="nunique_loans_defaulted_gt",
    width=bar_width
)
ax2.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["nunique_loans_approved_defaulted"].values,
    color="lightcoral",
    label="nunique_loans_approved_defaulted",
    width=bar_width
)

ax2.set_title("Requests closer inspection)")
ax2.set_xticks(x_values)
ax2.set_xticklabels(ambitious_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax2.legend()

ax3.bar(
    # ambitious_plot_data.index,
    x_values + bar_width / 2,
    ambitious_plot_data["total_amount_requested_defaulted_gt"].values,
    color="red",
    label="total_amount_requested_defaulted_gt",
    width=bar_width
)
ax3.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["total_amount_requested"].values,
    color="steelblue",
    label="total_amount_requested",
    width=bar_width
)
ax3.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["amount_approved"].values,
    color="limegreen",
    label="amount_approved",
    width=bar_width
)
ax3.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["amount_approved_defaulted"].values,
    color="lightcoral",
    label="amount_approved_defaulted",
    width=bar_width
)

ax3.set_title("Volume of Loans")
ax3.set_xticks(x_values)
ax3.set_xticklabels(ambitious_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax3.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax3.legend()

ax4.bar(
    # ambitious_plot_data.index,
    x_values + bar_width / 2,
    ambitious_plot_data["total_amount_requested_defaulted_gt"].values,
    color="red",
    label="total_amount_requested_defaulted_gt",
    width=bar_width
)
ax4.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    ambitious_plot_data["amount_approved_defaulted"].values,
    color="lightcoral",
    label="amount_approved_defaulted",
    width=bar_width
)

ax4.set_title("Volume closer inspection")
ax4.set_xticks(x_values)
ax4.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax4.set_xticklabels(ambitious_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax4.legend()

plt.tight_layout()
plt.show()
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
-RECORD 0-----------------------------------------------------------------
 avg(nunique_loans_approved_weekly_growth)           | 1.1077757861859945 
 avg(nunique_loans_approved_defaulted_weekly_growth) | 1.0640517734553776 
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/05/08 12:58:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
../_images/ce6e27cd72e029504d2a11f3d340d56f9189d43010e6ee65692cac93861371f4.png

11.6. Conservative vs Ambitious#

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
bar_width = .4
x_values = np.arange(len(ambitious_plot_data.index))

ax1.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["nunique_loans_approved_defaulted"].values,
    color="lightcoral",
    label="nunique_loans_approved_defaulted_conservative",
    width=bar_width
)
ax1.bar(
    # ambitious_plot_data.index,
    x_values + bar_width / 2,
    ambitious_plot_data["nunique_loans_approved_defaulted"].values,
    color="firebrick",
    label="nunique_loans_approved_defaulted_ambitious",
    width=bar_width
)

ax1.set_title("Requests comparisson")
ax1.set_xticks(x_values)
ax1.set_xticklabels(ambitious_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax1.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax1.legend()

ax2.bar(
    # ambitious_plot_data.index,
    x_values - bar_width / 2,
    conservative_plot_data["amount_approved_defaulted"].values,
    color="red",
    label="amount_approved_defaulted_conservative",
    width=bar_width
)
ax2.bar(
    # ambitious_plot_data.index,
    x_values + bar_width / 2,
    ambitious_plot_data["amount_approved_defaulted"].values,
    color="lightcoral",
    label="amount_approved_defaulted_ambitious",
    width=bar_width
)

ax2.set_title("Volume comparisson")
ax2.set_xticks(x_values)
ax2.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0f}'.format(x)))
ax2.set_xticklabels(ambitious_plot_data.index.map(lambda x: str(x).split(" ")[0]), rotation=45)
ax2.legend()

plt.show()
../_images/22df8508fe37373890dc01479f8f211a784c63b57206fcbb6a25043a2d2fbafb.png