Chapter 19 & 20: Market Microstructure¶
AFML Ch. 19-20 -- Liquidity, price impact, and informed trading metrics.
Market microstructure features quantify the interaction between order flow, liquidity, and price formation. These metrics are critical for understanding transaction costs, detecting informed trading, and measuring market quality.
Topics covered:
- Tick rule classification
- VPIN (Volume-Synchronized Probability of Informed Trading)
- Amihud lambda (price impact)
- Kyle's lambda (order flow impact)
- Roll spread estimator
- Corwin-Schultz spread estimator
- Hasbrouck lambda (Gibbs sampling)
- Volatility estimators (Parkinson, Garman-Klass)
- Polars integration
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import pymlfinance
import pymlfinance.polars
from pymlfinance.polars._lib import (
amihud_lambda as pl_amihud, kyle_lambda as pl_kyle,
corwin_schultz_spread as pl_cs, vpin as pl_vpin,
parkinson_volatility as pl_parkinson,
garman_klass_volatility as pl_gk,
yang_zhang_volatility as pl_yz,
)
%matplotlib inline
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['font.size'] = 15
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['axes.labelsize'] = 15
plt.rcParams['xtick.labelsize'] = 13
plt.rcParams['ytick.labelsize'] = 13
plt.rcParams['legend.fontsize'] = 13
np.random.seed(42)
Generate Synthetic Trade Data¶
We simulate 2,000 trades with a simple microstructure model:
- An efficient price follows a random walk
- Trade prices include bid-ask bounce (the observed price jumps between bid and ask)
- Volumes are drawn from an exponential distribution
- OHLC bars are aggregated from the tick data for spread estimators
n = 2000
# Efficient (fundamental) price: Gaussian random walk starting at 100.
efficient_price = 100.0 + np.cumsum(np.random.randn(n) * 0.05)
# Observed trade prices add bid-ask bounce: each trade executes at the bid
# or the ask with equal probability.
spread = 0.10
trade_prices = efficient_price + np.random.choice([-spread/2, spread/2], n)
# Trade sizes. Exponential draws are already non-negative, so the original
# np.abs() wrapper was redundant and has been removed (identical values).
volumes = np.random.exponential(100, n)
dollar_volumes = trade_prices * volumes
# OHLC bars aggregated from the ticks (for spread estimators).
# A (n_bars, bar_size) reshape replaces the four per-bar Python loops;
# the resulting arrays are identical to the loop versions.
n_bars = 200
bar_size = n // n_bars
price_grid = trade_prices[:n_bars * bar_size].reshape(n_bars, bar_size)
highs = price_grid.max(axis=1)
lows = price_grid.min(axis=1)
opens = price_grid[:, 0]
closes = price_grid[:, -1]
print(f"Generated {n} trades, {n_bars} OHLC bars")
print(f" Price range: {trade_prices.min():.2f} - {trade_prices.max():.2f}")
print(f" True spread: {spread:.2f}")
Generated 2000 trades, 200 OHLC bars Price range: 99.12 - 104.82 True spread: 0.10
fig, (ax_price, ax_vol) = plt.subplots(2, 1, figsize=(12, 7))
# Top panel: latent efficient price overlaid with the bounced trade prints.
ax_price.plot(efficient_price, color="steelblue", linewidth=0.8, alpha=0.7, label="Efficient price")
ax_price.plot(trade_prices, color="gray", linewidth=0.3, alpha=0.5, label="Trade prices")
ax_price.set_ylabel("Price")
ax_price.set_title("Trade Prices vs Efficient Price")
ax_price.legend()
# Bottom panel: sizes of the first 200 trades.
n_show = min(200, len(volumes))
ax_vol.bar(range(n_show), volumes[:200], color="steelblue", alpha=0.6, width=1.0)
ax_vol.set_xlabel("Trade index")
ax_vol.set_ylabel("Volume")
ax_vol.set_title("Trade Volumes (first 200)")
plt.tight_layout()
plt.show()
Tick Rule Classification¶
The tick rule classifies each trade as buyer- or seller-initiated based on price changes:
- Uptick (+1): price increased from previous trade (buyer-initiated)
- Downtick (-1): price decreased (seller-initiated)
- No change (0): same price as previous trade
# Classify each trade as buyer- (+1) or seller-initiated (-1) from the tick rule.
tick_signs = pymlfinance.features.tick_rule_classify(trade_prices)
n_up = np.sum(tick_signs > 0)
n_down = np.sum(tick_signs < 0)
n_flat = np.sum(tick_signs == 0)
print("--- Tick Rule Classification ---")
print(f" Upticks (+1): {n_up}")
print(f" Downticks (-1): {n_down}")
print(f" No change (0): {n_flat}")
# The mean sign measures net buy/sell pressure over the sample.
print(f" Buy/sell imbalance: {np.mean(tick_signs):.4f}")
--- Tick Rule Classification --- Upticks (+1): 1045 Downticks (-1): 954 No change (0): 1 Buy/sell imbalance: 0.0455
# Tick classification pie chart: share of up / down / flat ticks.
tick_counts = [np.sum(tick_signs > 0), np.sum(tick_signs < 0), np.sum(tick_signs == 0)]
tick_labels = ["Uptick (+1)", "Downtick (-1)", "No change (0)"]
tick_colors = ["#4CAF50", "#F44336", "#9E9E9E"]
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    tick_counts,
    labels=tick_labels,
    colors=tick_colors,
    autopct="%1.1f%%",
    startangle=90,
    textprops={"fontsize": 11},
)
ax.set_title("Tick Rule Classification Distribution")
plt.tight_layout()
plt.show()
VPIN (Volume-Synchronized Probability of Informed Trading)¶
VPIN estimates the fraction of trading volume that comes from informed traders. It partitions trading activity into equal-volume buckets and measures the buy/sell imbalance within each bucket. Higher VPIN values indicate more informed trading (potential adverse selection).
# Partition total volume into ~100 equal-volume buckets; VPIN is the rolling
# buy/sell imbalance over n_buckets consecutive buckets.
total_volume = float(np.sum(volumes))
vpin_values = pymlfinance.features.vpin(
    volumes=volumes,
    prices=trade_prices,
    bucket_size=total_volume / 100,
    n_buckets=10,
)
print("--- VPIN ---")
print(f" VPIN series length: {len(vpin_values)}")
if len(vpin_values) > 0:
    # Summary statistics of the VPIN series.
    for label, stat in (("Mean", np.mean), ("Max", np.max), ("Min", np.min)):
        print(f" {label} VPIN: {stat(vpin_values):.4f}")
--- VPIN --- VPIN series length: 86 Mean VPIN: 0.2487 Max VPIN: 0.3856 Min VPIN: 0.0894
if len(vpin_values) > 0:
    fig, ax = plt.subplots(figsize=(12, 5))
    mean_vpin = np.mean(vpin_values)
    ax.plot(vpin_values, color="darkred", linewidth=1)
    # Dashed reference line at the sample mean.
    ax.axhline(y=mean_vpin, color="gray", linestyle="--", alpha=0.7,
               label=f"Mean VPIN ({mean_vpin:.3f})")
    ax.fill_between(range(len(vpin_values)), vpin_values, alpha=0.2, color="darkred")
    ax.set_xlabel("Bucket index")
    ax.set_ylabel("VPIN")
    ax.set_title("VPIN Over Time")
    ax.legend()
    # VPIN is a probability-like quantity, so clamp the axis to [0, 1].
    ax.set_ylim(0, 1)
    plt.tight_layout()
    plt.show()
Price Impact: Amihud Lambda¶
Amihud's illiquidity measure (lambda) captures the average price impact per unit of dollar volume traded. Higher values indicate less liquid markets where trading has a larger impact on prices.
$$\lambda_{Amihud} = \frac{1}{N} \sum_t \frac{|r_t|}{V_t^\$}$$
# Log returns over consecutive trades; dollar volume is aligned by dropping
# the first observation (returns have one fewer element than prices).
returns = pymlfinance.core.log_returns(trade_prices)
amihud = pymlfinance.features.amihud_lambda(returns, dollar_volumes[1:])
print("--- Amihud Lambda (Price Impact) ---")
print(f" Amihud lambda: {amihud:.8f}")
print(" (Higher = less liquid / greater price impact)")
# Rolling version shows how liquidity varies through the sample.
amihud_rolling = pymlfinance.features.amihud_lambda_rolling(
    returns, dollar_volumes[1:], window=100
)
print(f" Rolling Amihud (window=100): mean={np.nanmean(amihud_rolling):.8f}")
--- Amihud Lambda (Price Impact) --- Amihud lambda: 0.00000322 (Higher = less liquid / greater price impact) Rolling Amihud (window=100): mean=0.00000337
Kyle's Lambda¶
Kyle's lambda measures the permanent price impact of order flow. It is estimated from the regression of returns on signed (buy/sell) volume. Higher lambda implies that trades carry more information about fundamental value.
# Signed volume: tick sign times trade size, aligned with returns (drop first tick).
signed_vol = tick_signs[1:].astype(np.float64) * volumes[1:]
kyle = pymlfinance.features.kyle_lambda(returns, signed_vol)
print("--- Kyle Lambda ---")
print(f" Kyle lambda: {kyle:.8f}")
print(" (Permanent price impact per unit signed volume)")
--- Kyle Lambda --- Kyle lambda: 0.00000345 (Permanent price impact per unit signed volume)
Spread Estimators¶
Spread estimators infer the bid-ask spread from observed prices without needing the actual order book. We compare two approaches:
- Roll spread: Uses the negative autocovariance of returns (works with close prices)
- Corwin-Schultz spread: Uses high-low price ratios across adjacent bars
# Roll (1984): spread inferred from the serial covariance of close-to-close returns.
roll = pymlfinance.features.roll_spread(closes)
roll_error = abs(roll - spread)
print("--- Roll Spread Estimator ---")
print(f" Estimated spread: {roll:.4f}")
print(f" True spread: {spread:.4f}")
print(f" Error: {roll_error:.4f}")
# Rolling estimate over 20-bar windows.
roll_rolling = pymlfinance.features.roll_spread_rolling(closes, window=20)
print(f" Rolling mean: {np.nanmean(roll_rolling):.4f}")
--- Roll Spread Estimator --- Estimated spread: 0.0968 True spread: 0.1000 Error: 0.0032 Rolling mean: 0.1080
# Corwin-Schultz: spread estimated from high/low ranges of adjacent bars.
# The estimator can produce NaNs, so filter them before summarizing.
cs_spread = pymlfinance.features.corwin_schultz_spread(highs, lows)
print("--- Corwin-Schultz Spread ---")
cs_valid = cs_spread[~np.isnan(cs_spread)]
if cs_valid.size:
    print(f" Mean spread: {np.mean(cs_valid):.4f}")
    print(f" Median spread: {np.median(cs_valid):.4f}")
--- Corwin-Schultz Spread --- Mean spread: 0.0006 Median spread: 0.0004
fig, (ax_cmp, ax_roll) = plt.subplots(1, 2, figsize=(12, 5))
# Left panel: point estimates vs the true spread.
cs_mean = np.nanmean(cs_valid) if len(cs_valid) > 0 else 0
spread_names = ["True Spread", "Roll", "Corwin-Schultz"]
spread_values = [spread, roll, cs_mean]
ax_cmp.bar(spread_names, spread_values, color=["steelblue", "darkgreen", "darkred"], alpha=0.8)
ax_cmp.axhline(y=spread, color="gray", linestyle="--", alpha=0.5)
ax_cmp.set_ylabel("Spread")
ax_cmp.set_title("Spread Estimator Comparison")
# Right panel: rolling Roll estimate with the NaN warm-up window dropped.
valid = ~np.isnan(roll_rolling)
xs = np.arange(len(roll_rolling))
ax_roll.plot(xs[valid], roll_rolling[valid], color="darkgreen", linewidth=1)
ax_roll.axhline(y=spread, color="red", linestyle="--", alpha=0.7, label=f"True spread ({spread})")
ax_roll.set_xlabel("Bar index")
ax_roll.set_ylabel("Rolling Roll spread")
ax_roll.set_title("Rolling Roll Spread (window=20)")
ax_roll.legend()
plt.tight_layout()
plt.show()
Hasbrouck Lambda (Gibbs Sampling)¶
Hasbrouck's lambda decomposes price changes into permanent (information) and transitory (noise) components using a Bayesian Gibbs sampling approach. This provides a more robust estimate of information-driven price impact.
# Trade signs aligned with returns. The Gibbs sampler expects non-zero signs,
# so zero ticks inherit the most recent non-zero (or leading) sign.
trade_signs = tick_signs[1:].astype(np.float64)
last_sign = trade_signs[0] if len(trade_signs) else 0.0
for j in range(1, len(trade_signs)):
    if trade_signs[j] == 0:
        trade_signs[j] = last_sign
    else:
        last_sign = trade_signs[j]
hasbrouck = pymlfinance.features.hasbrouck_lambda(
    returns, trade_signs, n_iterations=100, seed=42
)
print("--- Hasbrouck Lambda (Gibbs Sampling) ---")
print(f" Hasbrouck lambda: {hasbrouck:.8f}")
--- Hasbrouck Lambda (Gibbs Sampling) --- Hasbrouck lambda: 0.00067728
Polars API¶
All microstructure metrics are available through the Polars expression API.
Single-column functions use the .ml namespace, while multi-column functions
(like volatility estimators) are called as standalone expressions.
# Tick-level Polars: same tick-rule classification via the .ml expression namespace.
tick_df = pl.DataFrame({"price": trade_prices, "volume": volumes})
tick_result = tick_df.with_columns(
    pl.col("price").ml.tick_rule_classify().alias("tick_sign")
)
first_five = tick_result["tick_sign"].head(5).to_list()
print(f" Tick signs (first 5): {first_five}")
Tick signs (first 5): [0.0, -1.0, 1.0, -1.0, -1.0]
# Bar-level Polars
# Bar returns: prepend NaN so the series has one value per bar row.
bar_returns = np.concatenate([[np.nan], pymlfinance.core.log_returns(closes)])
# BUG FIX: the original computed closes * volumes[:n_bars], pairing per-bar
# closes with the first n_bars *tick* volumes -- not the bar's traded dollar
# volume. Sum the tick-level dollar volume inside each bar instead.
bar_dollar_vol = dollar_volumes[:n_bars * bar_size].reshape(n_bars, bar_size).sum(axis=1)
bar_df = pl.DataFrame({
    "open": opens, "high": highs, "low": lows, "close": closes,
    "returns": bar_returns,
    "dollar_volume": bar_dollar_vol,
})
# Multi-column functions
# Parkinson needs only high/low; Garman-Klass uses the full OHLC bar.
parkinson = bar_df.with_columns(
    pl_parkinson(pl.col("high"), pl.col("low"), window=20).alias("parkinson_vol"),
)
park_arr = parkinson["parkinson_vol"].to_numpy()
park_non_nan = park_arr[~np.isnan(park_arr)]
print(f" Parkinson vol (first 3 valid): {park_non_nan[:3].tolist()}")
gk = bar_df.with_columns(
    pl_gk(
        pl.col("open"), pl.col("high"), pl.col("low"), pl.col("close"), window=20
    ).alias("gk_vol"),
)
gk_arr = gk["gk_vol"].to_numpy()
gk_non_nan = gk_arr[~np.isnan(gk_arr)]
print(f" Garman-Klass vol (first 3 valid): {gk_non_nan[:3].tolist()}")
# Volatility estimator comparison
park_vals = parkinson["parkinson_vol"].drop_nulls().to_numpy()
gk_vals = gk["gk_vol"].drop_nulls().to_numpy()
fig, (ax_ts, ax_bar) = plt.subplots(1, 2, figsize=(12, 5))
# Left panel: rolling estimates overlaid through time.
ax_ts.plot(park_vals, color="steelblue", linewidth=1, alpha=0.8, label="Parkinson")
ax_ts.plot(gk_vals, color="darkred", linewidth=1, alpha=0.8, label="Garman-Klass")
ax_ts.set_xlabel("Bar index")
ax_ts.set_ylabel("Volatility estimate")
ax_ts.set_title("Rolling Volatility Estimators (window=20)")
ax_ts.legend()
# Right panel: average level of each estimator.
vol_names = ["Parkinson", "Garman-Klass"]
vol_means = [np.nanmean(park_vals), np.nanmean(gk_vals)]
ax_bar.bar(vol_names, vol_means, color=["steelblue", "darkred"], alpha=0.8)
ax_bar.set_ylabel("Mean volatility")
ax_bar.set_title("Average Volatility Estimates")
plt.suptitle("Volatility Estimator Comparison", fontsize=15, fontweight="bold")
plt.tight_layout()
plt.show()
Exercises¶
Spread tracking -- Increase the true spread (e.g., to 0.20 or 0.50) and verify that the Roll and Corwin-Schultz estimators track it. How accurate are they at different spread levels?
Informed trading -- Add directional volume (e.g., make 70% of trades buyer-initiated during a particular period) and observe whether VPIN increases in that region.
Liquidity regimes -- Compare Amihud and Kyle lambdas under different liquidity conditions by varying the volume distribution (e.g., high vs low average volume periods).