S&P 500 Volatility Clusters

Author

Jianyuan(Andy) Hu

Published

December 17, 2024

Project

Goal: Use the broad stock index to model and understand the index volatility levels.

Key methodology: Use KMeans to build clustering model and then volatility regimes and then build transitional probability distribution among the regimes

Import Library

Code
import pyprojroot
from pyprojroot.here import here
import os

import yfinance as yf
import pandas as pd
import numpy as np
import ibis
import ibis.selectors as s
from ibis import _
ibis.options.interactive = True
ibis.options.repr.interactive.max_rows = 20

from plotnine import ggplot, geom_line, geom_path, aes, facet_wrap, labs, scale_x_continuous, theme, element_text, scale_y_continuous, scale_x_date, scale_color_manual
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

Project File Paths

Code
base_path = pyprojroot.find_root(pyprojroot.has_file('.here'))
output_dir = os.path.join(base_path, "Data", "out")

Import Data

Get a list of S&P 500 price index daily

Code
# Define the ticker symbol for the S&P 500 index
ticker = '^GSPC'

# Define the start and end dates
start_date = '2013-01-01'
end_date = '2024-12-26'

# Fetch the historical data
sp500_data = yf.download(ticker, start=start_date, end=end_date, multi_level_index=False).reset_index()

# Display the data
sp500_data.head(10)
Date Close High Low Open Volume
0 2013-01-02 1462.420044 1462.430054 1426.189941 1426.189941 4202600000
1 2013-01-03 1459.369995 1465.469971 1455.530029 1462.420044 3829730000
2 2013-01-04 1466.469971 1467.939941 1458.989990 1459.369995 3424290000
3 2013-01-07 1461.890015 1466.469971 1456.619995 1466.469971 3304970000
4 2013-01-08 1457.150024 1461.890015 1451.640015 1461.890015 3601600000
5 2013-01-09 1461.020020 1464.729980 1457.150024 1457.150024 3674390000
6 2013-01-10 1472.119995 1472.300049 1461.020020 1461.020020 4081840000
7 2013-01-11 1472.050049 1472.750000 1467.579956 1472.119995 3340650000
8 2013-01-14 1470.680054 1472.050049 1465.689941 1472.050049 3003010000
9 2013-01-15 1472.339966 1473.310059 1463.760010 1470.670044 3135350000

Clean data with ibis framework. We only need to keep Date and Close columns. Since data is downloaded live, we also archive a copy of data.

It is totally okay to use Pandas to clean the data. It’s just a personal preference that I prefer the modernized and portable syntax of ibis framework.

Code
# import to duckdb backend of ibis framework
sp500_data_ibis = ibis.memtable(data=sp500_data)

sp500_data_cleaned = (
    sp500_data_ibis.select("Date", "Close")
        .mutate(Date = _.Date.date(), Close = _.Close.round(digits = 2))
)
# export a csv copy
sp500_data_cleaned.to_csv(path = os.path.join(output_dir, "sp500_close.csv")) 

# preview of data
sp500_data_cleaned
┏━━━━━━━━━━━━┳━━━━━━━━━┓
┃ Date        Close   ┃
┡━━━━━━━━━━━━╇━━━━━━━━━┩
│ datefloat64 │
├────────────┼─────────┤
│ 2013-01-021462.42 │
│ 2013-01-031459.37 │
│ 2013-01-041466.47 │
│ 2013-01-071461.89 │
│ 2013-01-081457.15 │
│ 2013-01-091461.02 │
│ 2013-01-101472.12 │
│ 2013-01-111472.05 │
│ 2013-01-141470.68 │
│ 2013-01-151472.34 │
│ 2013-01-161472.63 │
│ 2013-01-171480.94 │
│ 2013-01-181485.98 │
│ 2013-01-221492.56 │
│ 2013-01-231494.81 │
│ 2013-01-241494.82 │
│ 2013-01-251502.96 │
│ 2013-01-281500.18 │
│ 2013-01-291507.84 │
│ 2013-01-301501.96 │
│  │
└────────────┴─────────┘

Calculate Moving Average and Volatility

Code
return_window = ibis.window(preceding=30, following=0, order_by="Date")

sp500_data_transformed = (sp500_data_cleaned.mutate(Previous_Close = _.Close.lag())
    .mutate(Daily_Return = ((_.Close - _.Previous_Close)/_.Previous_Close).round(digits = 6))
    .mutate(thirty_day_vol = ibis.ifelse(
        _.Daily_Return.count().over(return_window) >= 30, 
        _.Daily_Return.std().over(return_window).round(digits = 6), 
        None))
)

sp500_vol_no_null = sp500_data_transformed.filter(_.thirty_day_vol != None)
sp500_vol_dates = sp500_vol_no_null.select("Date").mutate(index = ibis.row_number())
sp500_vol = sp500_vol_no_null.select("thirty_day_vol")

# bring to pandas dataframes to be more compatible with sklearn APIs
sp500_vol_pd = sp500_vol.to_pandas()

Train KMeans Model

Find the Optimal K, using elbow method

In the following “elbow charts”, trade off between inertia and silhouette scores, we settle at 9 clusters, as it gives a relatively high silhouette scores whil keeping a relatively low inertia.

Code
def find_best_k_for_kmeans_clustering(dataframe, scaler, kmin, kmax,  random_state=42, figheight=8, figwidth=10):
    scalermethod = scaler;
    dataframe_scaled = pd.DataFrame(data=scaler.fit_transform(dataframe[dataframe.columns]), columns=dataframe.columns);
    from sklearn.metrics import silhouette_score, silhouette_samples;
    inertias = {}
    silhouettes = {}
    for k in range(kmin, kmax):
        kmeans = KMeans(n_clusters=k, random_state=random_state).fit(dataframe_scaled)
        inertias[k] = kmeans.inertia_
        silhouettes[k] = silhouette_score(dataframe_scaled, kmeans.labels_, metric='euclidean')
    inertias_df = ibis.memtable(list(inertias.items()), columns=["cluster", "inertia"])
    silhouettes_df = ibis.memtable(list(silhouettes.items()), columns=["cluster", "silhouettes"])
    metrics_df = inertias_df.left_join(silhouettes_df, "cluster").select(~s.contains("_right"))
    return metrics_df

kmeans_metrics_df = find_best_k_for_kmeans_clustering(sp500_vol_pd, scaler=MinMaxScaler(), kmin=3, kmax=15)

kmeans_metrics_df
┏━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ cluster  inertia   silhouettes ┃
┡━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ int64float64float64     │
├─────────┼──────────┼─────────────┤
│       35.6948220.651185 │
│       44.6907290.648749 │
│       52.6827290.540790 │
│       61.5797150.553520 │
│       71.0865680.571062 │
│       80.8375670.578115 │
│       90.6810400.581327 │
│      100.6059770.576432 │
│      110.4705950.577266 │
│      120.4014810.536613 │
│      130.3251400.533363 │
│      140.2633710.545676 │
└─────────┴──────────┴─────────────┘

Create “elbow chart” for inertia (sum of squared distances within each cluster).

We are looking for the number of clusters that yield the relatively lower “turning” of the line.

Code
(
    ggplot(kmeans_metrics_df, aes("cluster", "inertia"))
    + geom_line()
    + scale_x_continuous(breaks=range(3,15))
    + labs(
        x="Number of clusters, K",
        y="Inertia",
        title="K-Means, Elbow Method")
)

Create “elbow chart” for silhouette scores (measuring separation among clusters).

We are looking for the number of clusters that yield the relatively higher “turning” of the line.

Code
(
    ggplot()
    + geom_line(kmeans_metrics_df, aes("cluster", "silhouettes"))
    + scale_x_continuous(breaks=range(3,15))
    + labs(
        x="Number of clusters, K",
        y="silhouettes",
        title="K-Means, Elbow Method")
)

Based on the outputs and criteria, it seems 8 clusters are appropriate.

Train the Optimal KMeans Model and Cluster Volatility

Train the optimal KMeans model and predict the cluster label and then cluster volatilities into 9 different clusters.

Code
def predict_cluster_with_kmeans(dataframe, optimal_k, scaler, Random_State=42):
    from sklearn.cluster import KMeans;
    kmeans = KMeans(n_clusters=optimal_k, random_state=Random_State);
    dataframe_scaled = pd.DataFrame(data=scaler.fit_transform(dataframe[dataframe.columns]), columns=dataframe.columns);
    dataframe['cluster'] = kmeans.fit_predict(dataframe_scaled)
    return dataframe

sp500_vol_pred = predict_cluster_with_kmeans(dataframe=sp500_vol_pd, optimal_k=8, scaler=MinMaxScaler())

sp500_vol_pred.head(10)
thirty_day_vol cluster
0 0.004513 0
1 0.004457 0
2 0.004543 0
3 0.005142 0
4 0.005258 0
5 0.005384 6
6 0.006422 6
7 0.006374 6
8 0.006736 6
9 0.006735 6

View the descriptive stats on each cluster.

Code
sp500_vol_pred = (
    ibis.memtable(data=sp500_vol_pred)
    # adding the dates back to the volatility dataset
        .mutate(index = ibis.row_number())
        .left_join(sp500_vol_dates, "index")
        .select(~s.startswith("index"))
)

(
    sp500_vol_pred.aggregate(
            by = "cluster",
            count = _.thirty_day_vol.count(), 
            mean = _.thirty_day_vol.mean(),
            max =  _.thirty_day_vol.max(),
            median = _.thirty_day_vol.median(), 
            min = _.thirty_day_vol.min())
        .order_by(_.cluster)
)
┏━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓
┃ cluster  count  mean      max       median    min      ┃
┡━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩
│ int32int64float64float64float64float64  │
├─────────┼───────┼──────────┼──────────┼──────────┼──────────┤
│       05170.0043710.0053270.0044710.002251 │
│       13490.0131440.0151480.0130460.011789 │
│       2310.0488670.0529270.0502290.041277 │
│       37040.0079310.0091600.0078560.007118 │
│       4140.0297300.0358640.0308570.023744 │
│       52290.0171680.0234090.0167610.015194 │
│       67320.0062770.0071150.0063080.005332 │
│       74100.0103690.0117630.0103530.009173 │
└─────────┴───────┴──────────┴──────────┴──────────┴──────────┘

Create the bridging table to relabel clusters.

Relabel the cluster in ascending order of mean volatility of each regime

This is used to create the transitional probability table later.

Code
sp500_vol_pred_cluster_bridge = (
    sp500_vol_pred.aggregate(
        by = _.cluster, 
        cluster_mean_vol = _.thirty_day_vol.mean())
        .order_by(_.cluster_mean_vol)
        .mutate(cluster_new = ibis.row_number())
)

sp500_vol_pred_relabelled = (
    sp500_vol_pred.left_join(sp500_vol_pred_cluster_bridge, _.cluster == sp500_vol_pred_cluster_bridge.cluster)
        .drop(["cluster", "cluster_right"])
        .rename(cluster = "cluster_new")
)

sp500_vol_pred_relabelled
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
┃ thirty_day_vol  Date        cluster_mean_vol  cluster ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩
│ float64datefloat64int64   │
├────────────────┼────────────┼──────────────────┼─────────┤
│       0.0045132013-02-140.0043710 │
│       0.0044572013-02-150.0043710 │
│       0.0045432013-02-190.0043710 │
│       0.0051422013-02-200.0043710 │
│       0.0052582013-02-210.0043710 │
│       0.0053842013-02-220.0062771 │
│       0.0064222013-02-250.0062771 │
│       0.0063742013-02-260.0062771 │
│       0.0067362013-02-270.0062771 │
│       0.0067352013-02-280.0062771 │
│       0.0067402013-03-010.0062771 │
│       0.0067682013-03-040.0062771 │
│       0.0068912013-03-050.0062771 │
│       0.0068792013-03-060.0062771 │
│       0.0068552013-03-070.0062771 │
│       0.0068812013-03-080.0062771 │
│       0.0068872013-03-110.0062771 │
│       0.0068742013-03-120.0062771 │
│       0.0068532013-03-130.0062771 │
│       0.0068942013-03-180.0062771 │
│               │
└────────────────┴────────────┴──────────────────┴─────────┘

Review the key volatility stats based on final clusters:

Code
sp500_vol_pred_relabelled_summary = (
    sp500_vol_pred_relabelled.aggregate(
            by = "cluster",
            count = _.thirty_day_vol.count(), 
            mean = _.thirty_day_vol.mean(),
            max =  _.thirty_day_vol.max(),
            median = _.thirty_day_vol.median(), 
            min = _.thirty_day_vol.min())
        .order_by(_.cluster)
)

sp500_vol_pred_relabelled_summary
┏━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓
┃ cluster  count  mean      max       median    min      ┃
┡━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩
│ int64int64float64float64float64float64  │
├─────────┼───────┼──────────┼──────────┼──────────┼──────────┤
│       05170.0043710.0053270.0044710.002251 │
│       17320.0062770.0071150.0063080.005332 │
│       27040.0079310.0091600.0078560.007118 │
│       34100.0103690.0117630.0103530.009173 │
│       43490.0131440.0151480.0130460.011789 │
│       52290.0171680.0234090.0167610.015194 │
│       6140.0297300.0358640.0308570.023744 │
│       7310.0488670.0529270.0502290.041277 │
└─────────┴───────┴──────────┴──────────┴──────────┴──────────┘

Plot the Regime vs Actual thirty_day_Vol

In the following graph, you can see how actual 30-day volatility compares to (shaded in red) their major volatility cluster.

Code
sp500_vol_pred_pivoted = (
    sp500_vol_pred_relabelled.pivot_longer(col=["thirty_day_vol", "cluster_mean_vol"], names_to="Type", values_to="Volatility")
    .order_by(_.Type, _.Date)
)

(
    ggplot(sp500_vol_pred_pivoted, aes(x = "Date"))
    + geom_line(aes(y = "Volatility", color = "Type", group = 1), stat = "identity")
    + scale_color_manual(values=dict(thirty_day_vol = "blue", cluster_mean_vol = "red"))
    + scale_x_date(date_breaks="3 month", date_labels="%Y-%m")
    + labs(
        x = "Date", 
        y = "Volatility",
        title="S&P500 Volatility Regime Transitions" )
    + theme( 
        figure_size=(20, 10),
        legend_position='top',
        axis_text_x=element_text(rotation=45, hjust=1) )
)

Calculate Transition Probability

create cluster transitions counts dataframe

Code
sp500_vol_pred_shifted = (
    sp500_vol_pred.select(cluster_from = "cluster")
        .mutate(
            cluster_from = _.cluster_from.cast("String"),
            cluster_to = _.cluster_from.lead(1).cast("String"))
        # remove NAs from lead()
        .filter(_.cluster_to != None)
)
sp500_vol_pred_shifted
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ cluster_from  cluster_to ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ stringstring     │
├──────────────┼────────────┤
│ 0           0          │
│ 0           0          │
│ 0           0          │
│ 0           0          │
│ 0           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│ 6           6          │
│           │
└──────────────┴────────────┘
Code
# get a frequecy of each cluster to calculate relative frequency of each transition type, see below
sp500_cluster_vol_count = (
    sp500_vol_pred.aggregate(
                by = "cluster",
                count = _.thirty_day_vol.count())
    .mutate(cluster = _.cluster.cast("String"))
)

vol_transition_table = (
        # perform crosstabbing between cluster_from and cluster_to to understand count of transitions 
    sp500_vol_pred_shifted
        .mutate(counter = 1)
        .pivot_wider(names_from ="cluster_to", values_from="counter", values_agg="sum", values_fill=0, names_sort=True)
        # add cluster frequency
        .left_join(
            right = sp500_cluster_vol_count, 
            predicates= _.cluster_from == sp500_cluster_vol_count.cluster
        )
        # clean up data
        .drop(_.cluster_from)
        .order_by(_.cluster)
        .relocate(_.cluster, before="0")
)
vol_transition_table
┏━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓
┃ cluster  0      1      2      3      4      5      6      7      count ┃
┡━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩
│ stringint64int64int64int64int64int64int64int64int64 │
├─────────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┼───────┤
│ 0      46650802324517 │
│ 1      230004015523349 │
│ 2      00290110031 │
│ 3      1150586117326704 │
│ 4      00101110114 │
│ 5      01414120315229 │
│ 6      351070036149732 │
│ 7      224032037342410 │
└─────────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘
Code
# calculate relative frequencies
vol_transition_freq_table = vol_transition_table.mutate(s.across(s.numeric(), func = _ / vol_transition_table['count'])).drop("count")

vol_transition_freq_table
┏━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓
┃ cluster  0         1         2         3         4         5         6         7        ┃
┡━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩
│ stringfloat64float64float64float64float64float64float64float64  │
├─────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┼──────────┤
│ 0      0.9013540.0096710.0000000.0154740.0000000.0038680.0618960.007737 │
│ 1      0.0057310.8595990.0000000.0114610.0000000.0429800.0143270.065903 │
│ 2      0.0000000.0000000.9354840.0000000.0322580.0322580.0000000.000000 │
│ 3      0.0156250.0071020.0000000.8323860.0014200.0014200.1036930.036932 │
│ 4      0.0000000.0000000.0714290.0000000.7857140.0714290.0000000.071429 │
│ 5      0.0000000.0611350.0043670.0174670.0043670.8864630.0043670.021834 │
│ 6      0.0478140.0013660.0000000.0956280.0000000.0040980.8387980.012295 │
│ 7      0.0048780.0585370.0000000.0780490.0000000.0073170.0170730.834146 │
└─────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────┘