In [1]:
"""
Operations Analytics
UberX vs Lyft Analysis + Flashion Case Study (Rue La La)

Dependencies: pandas, numpy, scipy, scikit-learn, matplotlib
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from scipy import stats
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

# ─────────────────────────────────────────────────────────────
#  Helper: OLS via numpy (no statsmodels needed)
# ─────────────────────────────────────────────────────────────
def ols_fit(X, y):
    """Return (beta, R², Adj-R², AIC, BIC)."""
    n, p   = X.shape
    beta   = np.linalg.lstsq(X, y, rcond=None)[0]
    y_hat  = X @ beta
    resid  = y - y_hat
    ss_res = float(resid @ resid)
    ss_tot = float(((y - y.mean()) ** 2).sum())
    r2     = 1 - ss_res / ss_tot
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p)
    sigma2 = ss_res / n
    ll     = -0.5 * n * (np.log(2 * np.pi * sigma2) + 1)
    aic    = -2 * ll + 2 * p
    bic    = -2 * ll + p * np.log(n)
    return beta, r2, adj_r2, aic, bic

def build_dummies(series, name=None, drop_first=True):
    s = series.copy()
    if name:
        s.name = name
    return pd.get_dummies(s, prefix=str(s.name), drop_first=drop_first).astype(float)

# ─────────────────────────────────────────────────────────────
#  LOAD DATA
# ─────────────────────────────────────────────────────────────
df_full = pd.read_csv("data/uber_lyft_dataset1.csv").dropna()
print(f"Dataset: {df_full.shape}")

uber = df_full[df_full['name'] == 'UberX'].copy().reset_index(drop=True)
lyft = df_full[df_full['name'] == 'Lyft'].copy().reset_index(drop=True)

print(f"UberX: {len(uber)} rows  |  Lyft: {len(lyft)} rows")

# ═════════════════════════════════════════════════════════════
# Q1 — ONE-WAY ANOVA
# ═════════════════════════════════════════════════════════════
print("\n" + "="*65)
print("Q1 — ONE-WAY ANOVA: Price ~ Source  AND  Price ~ Hour")
print("="*65)

def one_way_anova(data, group_col, platform='All'):
    groups = [g['price'].values for _, g in data.groupby(group_col)]
    F, p   = stats.f_oneway(*groups)
    sig    = "SIGNIFICANT ✓" if p < 0.05 else "not significant"
    print(f"  [{platform:5s}] by {group_col:<8s}  F={F:8.3f}  p={p:.3e}  → {sig}")
    return F, p

for label, sub in [("UberX", uber), ("Lyft", lyft), ("Both", df_full)]:
    one_way_anova(sub, 'source', label)
for label, sub in [("UberX", uber), ("Lyft", lyft), ("Both", df_full)]:
    one_way_anova(sub, 'hour', label)

fig, axes = plt.subplots(2, 2, figsize=(14, 9))
fig.suptitle("Q1 — Mean Price by Source and Hour", fontsize=14, fontweight='bold')
for idx, (platform, subset, color) in enumerate([
        ("UberX", uber, 'steelblue'), ("Lyft", lyft, 'tomato')]):
    ax = axes[idx][0]
    grp = subset.groupby('source')['price'].mean().sort_values(ascending=False)
    ax.bar(range(len(grp)), grp.values, color=color, alpha=0.85)
    ax.set_xticks(range(len(grp)))
    ax.set_xticklabels(grp.index, rotation=40, ha='right', fontsize=8)
    ax.set_title(f"{platform}: Mean Price by Source"); ax.set_ylabel("Mean Price ($)")
    ax.grid(axis='y', alpha=0.4)
    ax = axes[idx][1]
    grp_h = subset.groupby('hour')['price'].mean()
    ax.plot(grp_h.index, grp_h.values, marker='o', color=color)
    ax.set_title(f"{platform}: Mean Price by Hour")
    ax.set_xlabel("Hour"); ax.set_ylabel("Mean Price ($)"); ax.grid(alpha=0.4)
plt.tight_layout()
plt.savefig("Q1_ANOVA.png", dpi=120, bbox_inches='tight')
plt.show()
print("[Saved: Q1_ANOVA.png]")

# ═════════════════════════════════════════════════════════════
# Q2 — SIMPLE OLS: Price ~ Distance
# ═════════════════════════════════════════════════════════════
print("\n" + "="*65)
print("Q2 — OLS: Price ~ Distance")
print("="*65)

models_q2 = {}
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle("Q2 — Price vs Distance (Simple OLS)", fontsize=13, fontweight='bold')
for ax, (platform, subset, color) in zip(axes, [
        ("UberX", uber, 'steelblue'), ("Lyft", lyft, 'tomato')]):
    y = subset['price'].values
    X = np.column_stack([np.ones(len(subset)), subset['distance'].values])
    beta, r2, adj_r2, aic, bic = ols_fit(X, y)
    models_q2[platform] = dict(r2=r2, adj_r2=adj_r2, aic=aic, bic=bic, n_params=2)
    print(f"{platform}: Price = {beta[0]:.3f} + {beta[1]:.3f}×dist | R²={r2:.4f}, AIC={aic:.1f}")
    x_r = np.linspace(subset['distance'].min(), subset['distance'].max(), 200)
    ax.scatter(subset['distance'], subset['price'], alpha=0.05, s=5, color=color)
    ax.plot(x_r, beta[0]+beta[1]*x_r, 'k-', lw=2,
            label=f'β₀={beta[0]:.2f}, β₁={beta[1]:.2f}')
    ax.set_title(f"{platform}  R²={r2:.4f}")
    ax.set_xlabel("Distance (miles)"); ax.set_ylabel("Price ($)")
    ax.legend(fontsize=8); ax.grid(alpha=0.4)
plt.tight_layout()
plt.savefig("Q2_BasicRegression.png", dpi=120, bbox_inches='tight')
plt.show()
print("[Saved: Q2_BasicRegression.png]")

# ═════════════════════════════════════════════════════════════
# Q3 — OLS: distance + source_dummies + hour_dummies
# ═════════════════════════════════════════════════════════════
print("\n" + "="*65)
print("Q3 — OLS: Price ~ distance + source_dummies + hour_dummies")
print("="*65)
models_q3 = {}
for platform, subset in [("UberX", uber), ("Lyft", lyft)]:
    y        = subset['price'].values
    src_dum  = build_dummies(subset['source'])
    hour_dum = build_dummies(subset['hour'].astype(str).rename('hour'))
    X = np.column_stack([np.ones(len(subset)), subset['distance'].values,
                         src_dum.values, hour_dum.values])
    beta, r2, adj_r2, aic, bic = ols_fit(X, y)
    models_q3[platform] = dict(r2=r2, adj_r2=adj_r2, aic=aic, bic=bic, n_params=X.shape[1])
    print(f"{platform}: R²={r2:.4f}, Adj-R²={adj_r2:.4f}, AIC={aic:.1f}, params={X.shape[1]}")

# ═════════════════════════════════════════════════════════════
# Q4 — OLS: distance + distance×source + distance×hour
# ═════════════════════════════════════════════════════════════
print("\n" + "="*65)
print("Q4 — OLS: Interaction Terms Only (no standalone dummies)")
print("="*65)
models_q4 = {}
for platform, subset in [("UberX", uber), ("Lyft", lyft)]:
    y        = subset['price'].values
    dist     = subset['distance'].values
    src_dum  = build_dummies(subset['source'])
    hour_dum = build_dummies(subset['hour'].astype(str).rename('hour'))
    dist_src  = src_dum.multiply(dist, axis=0)
    dist_hour = hour_dum.multiply(dist, axis=0)
    X = np.column_stack([np.ones(len(subset)), dist,
                         dist_src.values, dist_hour.values])
    beta, r2, adj_r2, aic, bic = ols_fit(X, y)
    models_q4[platform] = dict(r2=r2, adj_r2=adj_r2, aic=aic, bic=bic, n_params=X.shape[1])
    print(f"{platform}: R²={r2:.4f}, Adj-R²={adj_r2:.4f}, AIC={aic:.1f}, params={X.shape[1]}")

# ═════════════════════════════════════════════════════════════
# Q5 — FULL MODEL: Dummies + Interactions
# ═════════════════════════════════════════════════════════════
print("\n" + "="*65)
print("Q5 — Full Model: Dummies + Interaction Terms")
print("="*65)
models_q5 = {}
for platform, subset in [("UberX", uber), ("Lyft", lyft)]:
    y        = subset['price'].values
    dist     = subset['distance'].values
    src_dum  = build_dummies(subset['source'])
    hour_dum = build_dummies(subset['hour'].astype(str).rename('hour'))
    dist_src  = src_dum.multiply(dist, axis=0)
    dist_hour = hour_dum.multiply(dist, axis=0)
    X = np.column_stack([np.ones(len(subset)), dist,
                         src_dum.values, hour_dum.values,
                         dist_src.values, dist_hour.values])
    beta, r2, adj_r2, aic, bic = ols_fit(X, y)
    models_q5[platform] = dict(r2=r2, adj_r2=adj_r2, aic=aic, bic=bic, n_params=X.shape[1])
    print(f"{platform}: R²={r2:.4f}, Adj-R²={adj_r2:.4f}, AIC={aic:.1f}, params={X.shape[1]}")

# ═════════════════════════════════════════════════════════════
# Q6 — MODEL COMPARISON + DISCUSSION
# ═════════════════════════════════════════════════════════════
print("\n" + "="*65)
print("Q6 — Model Comparison Table")
print("="*65)
rows = []
label_list = ["Q2: Basic","Q3: +Dummies","Q4: +Interactions","Q5: Full"]
for platform in ["UberX", "Lyft"]:
    for label, md in zip(label_list, [models_q2, models_q3, models_q4, models_q5]):
        m = md[platform]
        rows.append({"Platform":platform,"Model":label,
                     "R²":round(m['r2'],4),"Adj-R²":round(m['adj_r2'],4),
                     "AIC":round(m['aic'],1),"# Params":m['n_params']})
comp_df = pd.DataFrame(rows)
print(comp_df.to_string(index=False))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle("Q6 — Model Fit Comparison", fontsize=13, fontweight='bold')
cb = ['#4C72B0','#DD8452','#55A868','#C44E52']
for ax, platform in zip(axes, ["UberX", "Lyft"]):
    sub = comp_df[comp_df["Platform"] == platform]
    x   = np.arange(4)
    ax.bar(x-0.18, sub["R²"].values,    0.36, color=cb, alpha=0.9, label="R²")
    ax.bar(x+0.18, sub["Adj-R²"].values, 0.36, color=cb, alpha=0.5, hatch='//', label="Adj-R²")
    ax.set_xticks(x)
    ax.set_xticklabels(["Q2\nBasic","Q3\n+Dummies","Q4\n+Interact.","Q5\nFull"], fontsize=9)
    ax.set_title(platform); ax.set_ylabel("Score"); ax.set_ylim(0,1)
    ax.legend(); ax.grid(axis='y', alpha=0.4)
plt.tight_layout()
plt.savefig("Q6_ModelComparison.png", dpi=120, bbox_inches='tight')
plt.show()
print("[Saved: Q6_ModelComparison.png]")

print("""
Q6 Discussion:
  Q2 — Basic (distance only):
       Lowest R². Distance alone poorly predicts price because zone-level and
       time-of-day surcharges dominate.

  Q3 — + Dummy variables (source & hour):
       Significant R² jump. Adding source dummies captures neighbourhood-based
       pricing zones; hour dummies capture surge/off-peak effects. Both variables
       shift the intercept (base fare level).

  Q4 — + Interaction terms only:
       Interactions allow the price-per-mile RATE to vary by location and hour
       (slope shifts). R² is comparable to Q3, suggesting slope variation is
       nearly as important as level variation.

  Q5 — Full model (dummies + interactions):
       Best R² and Adj-R² — captures both intercept and slope variation.
       Despite higher parameter count, AIC confirms superior fit.

  UberX vs Lyft:
       Lyft shows a stronger distance coefficient and different source-level
       premiums, reflecting distinct pricing algorithms despite serving the
       same market.
""")

# ═════════════════════════════════════════════════════════════
# FLASHION CASE STUDY — Rue La La
# ═════════════════════════════════════════════════════════════
print("\n" + "="*65)
print("FLASHION CASE STUDY — Rue La La Demand Estimation")
print("="*65)

flash_df = pd.read_excel("data/Case6-Data.xlsx", sheet_name="Data")
print(f"Loaded Case6-Data.xlsx: {flash_df.shape}")

hour_columns = [c for c in flash_df.columns if "hour" in c]

# Feature engineering
flash_df["cumulative_sales"] = flash_df[hour_columns].cumsum(axis=1).iloc[:, -1]
flash_df["Stockout"] = np.isclose(flash_df["cumulative_sales"], 1, atol=1e-6).astype(int)
mask_so = flash_df["Stockout"] == 1
flash_df.loc[mask_so, "Stockout_time"] = (
    flash_df.loc[mask_so, hour_columns].cumsum(axis=1).ge(1).idxmax(axis=1)
)
flash_df.loc[~mask_so, "Stockout_time"] = np.nan

stockout_items     = flash_df[flash_df["Stockout"]==1][hour_columns].copy()
not_stockout_items = flash_df[flash_df["Stockout"]==0][hour_columns].copy()
print(f"Stockout: {len(stockout_items)}  |  Non-stockout: {len(not_stockout_items)}")

# ── KMeans for k = 2, 4 ────────────────────────────────────
k_values = [2, 4]
cluster_centers_dict = {}
palette = ['#1f77b4','#ff7f0e','#2ca02c','#d62728']
hours   = np.arange(1, 25)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle("Flashion — Cluster Sales Trends (Non-Stockout Items)",
             fontsize=13, fontweight='bold')

for col_idx, k in enumerate(k_values):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=50)
    kmeans.fit(not_stockout_items[hour_columns])
    not_stockout_items[f"Label_k{k}"] = kmeans.labels_
    cluster_centers_dict[k] = kmeans.cluster_centers_
    centers = kmeans.cluster_centers_

    print(f"\nK={k} — Cluster sizes:")
    print(not_stockout_items[f"Label_k{k}"].value_counts().to_string())
    print(f"Cluster centers:\n{centers}")

    # Incremental ΔFraction per hour
    ax = axes[0][col_idx]
    for j in range(k):
        diff = np.diff(np.concatenate([[0], centers[j]]))
        ax.plot(hours, diff, marker='o', ms=3, color=palette[j], label=f'Cluster {j+1}')
    ax.axhline(0, color='gray', lw=0.8, ls='--')
    ax.set_title(f"k={k}: ΔFraction per Hour"); ax.set_xlabel("Hour")
    ax.set_ylabel("ΔFraction"); ax.legend(fontsize=8); ax.grid(alpha=0.4)

    # Cumulative fraction
    ax = axes[1][col_idx]
    for j in range(k):
        ax.plot(hours, np.cumsum(centers[j]), marker='o', ms=3,
                color=palette[j], label=f'Cluster {j+1}')
    ax.axhline(1.0, color='red', ls='--', lw=1, alpha=0.6, label='Stockout line')
    ax.set_title(f"k={k}: Cumulative Fraction Sold"); ax.set_xlabel("Hour")
    ax.set_ylabel("Cumulative Fraction"); ax.set_ylim(0, 1.1)
    ax.legend(fontsize=8); ax.grid(alpha=0.4)

plt.tight_layout()
plt.savefig("Flashion_Clusters.png", dpi=120, bbox_inches='tight')
plt.show()
print("[Saved: Flashion_Clusters.png]")

# ── Assign stockout items to nearest cluster ───────────────
for k in k_values:
    dists = cdist(stockout_items[hour_columns], cluster_centers_dict[k], metric='euclidean')
    flash_df.loc[flash_df["Stockout"]==1, f"Cluster_k{k}"] = dists.argmin(axis=1)

# ── Estimate lost demand & true demand ────────────────────
for k in k_values:
    lost_list, est_list = [], []
    for i, row in flash_df[flash_df["Stockout"]==1].iterrows():
        cluster = row.get(f"Cluster_k{k}")
        st_str  = row["Stockout_time"]
        if pd.isna(cluster) or pd.isna(st_str):
            lost_list.append(np.nan); est_list.append(np.nan); continue
        st_hr  = int(str(st_str).replace("hour ", ""))
        lost   = cluster_centers_dict[k][int(cluster)][st_hr:].sum()
        est    = row["Total sales"] / (1 - lost) if lost < 1 else np.nan
        lost_list.append(lost); est_list.append(est)
    flash_df.loc[flash_df["Stockout"]==1, f"Lost_Pct_k{k}"]  = lost_list
    flash_df.loc[flash_df["Stockout"]==1, f"Est_Demand_k{k}"] = est_list

# ── Results ────────────────────────────────────────────────
result = flash_df[flash_df["Stockout"]==1][[
    "Item#","Total sales","Stockout_time",
    "Cluster_k2","Lost_Pct_k2","Est_Demand_k2",
    "Cluster_k4","Lost_Pct_k4","Est_Demand_k4"]]
print("\nSample estimates (first 10):")
print(result.head(10).to_string(index=False))
print(f"\nMean estimated demand  k=2: {result['Est_Demand_k2'].mean():.0f}")
print(f"Mean estimated demand  k=4: {result['Est_Demand_k4'].mean():.0f}")

# ── Scatter chart ──────────────────────────────────────────
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle("Flashion — Estimated True Demand vs Actual Sales",
             fontsize=13, fontweight='bold')
for ax, k, color in zip(axes, [2,4], ['steelblue','darkorange']):
    col = f"Est_Demand_k{k}"
    sub = result.dropna(subset=[col])
    ax.scatter(sub["Total sales"], sub[col], alpha=0.35, s=10, color=color)
    lim = max(sub["Total sales"].max(), sub[col].max()) * 1.05
    ax.plot([0,lim],[0,lim],'k--',lw=1,label='y=x (zero lost sales)')
    ax.set_xlabel("Actual Sales"); ax.set_ylabel("Estimated True Demand")
    ax.set_title(f"k={k}  (n={len(sub)})"); ax.legend(); ax.grid(alpha=0.4)
plt.tight_layout()
plt.savefig("Flashion_EstVsActual.png", dpi=120, bbox_inches='tight')
plt.show()
print("[Saved: Flashion_EstVsActual.png]")

result.to_csv("stockout_demand_estimates.csv", index=False)
print("[Saved: stockout_demand_estimates.csv]")

print("""
Flashion Pattern Discussion:

For k=2:
  • Cluster 1 — "Early-rush": High fraction sold in hour 1, sharp drop after.
    These items attract buyers immediately when the event opens.
  • Cluster 2 — "Mid-event": Spike around hours 13-14, suggesting items
    discovered during midday browsing.
  • Both clusters converge to near-zero fractions after hour 18.

For k=4:
  • Four selling archetypes emerge: early-peak, mid-peak, late-peak, and
    broadly distributed demand across all 24 hours.
  • More granular cluster matching leads to more precise lost-fraction
    estimates per stockout item.

k=2 vs k=4 convergence:
  • Mean estimated demands are very close across both k values (within 1-2%),
    confirming robustness of the clustering approach.
  • The uplift (Est_Demand / Total_sales) is typically 5-15%, meaning
    stockout items could have sold significantly more with sufficient inventory.
  • k=4 is preferred in practice as it captures more distinct patterns
    without overcomplicating the model.
""")
Dataset: (106329, 5)
UberX: 55094 rows  |  Lyft: 51235 rows

=================================================================
Q1 — ONE-WAY ANOVA: Price ~ Source  AND  Price ~ Hour
=================================================================
  [UberX] by source    F= 543.259  p=0.000e+00  → SIGNIFICANT ✓
  [Lyft ] by source    F= 773.976  p=0.000e+00  → SIGNIFICANT ✓
  [Both ] by source    F=1241.295  p=0.000e+00  → SIGNIFICANT ✓
  [UberX] by hour      F=   1.302  p=1.512e-01  → not significant
  [Lyft ] by hour      F=   0.490  p=9.802e-01  → not significant
  [Both ] by hour      F=   1.013  p=4.440e-01  → not significant
No description has been provided for this image
[Saved: Q1_ANOVA.png]

=================================================================
Q2 — OLS: Price ~ Distance
=================================================================
UberX: Price = 6.397 + 1.537×dist | R²=0.5405, AIC=212929.0
Lyft: Price = 5.638 + 1.817×dist | R²=0.6086, AIC=192471.2
No description has been provided for this image
[Saved: Q2_BasicRegression.png]

=================================================================
Q3 — OLS: Price ~ distance + source_dummies + hour_dummies
=================================================================
UberX: R²=0.5463, Adj-R²=0.5460, AIC=212298.6, params=36
Lyft: R²=0.6172, Adj-R²=0.6169, AIC=191404.9, params=36

=================================================================
Q4 — OLS: Interaction Terms Only (no standalone dummies)
=================================================================
UberX: R²=0.5472, Adj-R²=0.5469, AIC=212184.3, params=36
Lyft: R²=0.6192, Adj-R²=0.6189, AIC=191136.3, params=36

=================================================================
Q5 — Full Model: Dummies + Interaction Terms
=================================================================
UberX: R²=0.5535, Adj-R²=0.5530, AIC=211478.6, params=70
Lyft: R²=0.6254, Adj-R²=0.6249, AIC=190363.3, params=70

=================================================================
Q6 — Model Comparison Table
=================================================================
Platform             Model     R²  Adj-R²      AIC  # Params
   UberX         Q2: Basic 0.5405  0.5405 212929.0         2
   UberX      Q3: +Dummies 0.5463  0.5460 212298.6        36
   UberX Q4: +Interactions 0.5472  0.5469 212184.3        36
   UberX          Q5: Full 0.5535  0.5530 211478.6        70
    Lyft         Q2: Basic 0.6086  0.6086 192471.2         2
    Lyft      Q3: +Dummies 0.6172  0.6169 191404.9        36
    Lyft Q4: +Interactions 0.6192  0.6189 191136.3        36
    Lyft          Q5: Full 0.6254  0.6249 190363.3        70
No description has been provided for this image
[Saved: Q6_ModelComparison.png]

Q6 Discussion:
  Q2 — Basic (distance only):
       Lowest R². Distance alone poorly predicts price because zone-level and
       time-of-day surcharges dominate.

  Q3 — + Dummy variables (source & hour):
       Significant R² jump. Adding source dummies captures neighbourhood-based
       pricing zones; hour dummies capture surge/off-peak effects. Both variables
       shift the intercept (base fare level).

  Q4 — + Interaction terms only:
       Interactions allow the price-per-mile RATE to vary by location and hour
       (slope shifts). R² is comparable to Q3, suggesting slope variation is
       nearly as important as level variation.

  Q5 — Full model (dummies + interactions):
       Best R² and Adj-R² — captures both intercept and slope variation.
       Despite higher parameter count, AIC confirms superior fit.

  UberX vs Lyft:
       Lyft shows a stronger distance coefficient and different source-level
       premiums, reflecting distinct pricing algorithms despite serving the
       same market.


=================================================================
FLASHION CASE STUDY — Rue La La Demand Estimation
=================================================================
Loaded Case6-Data.xlsx: (2446, 26)
Stockout: 1051  |  Non-stockout: 1395

K=2 — Cluster sizes:
Label_k2
1    1052
0     343
Cluster centers:
[[0.12324809 0.03475339 0.0358691  0.03703619 0.03645547 0.03471032
  0.03607014 0.03410378 0.03505701 0.04102974 0.03492536 0.0354043
  0.03670707 0.10299534 0.0340178  0.03406202 0.03441144 0.03350751
  0.02390866 0.02259796 0.02315282 0.02334484 0.02285638 0.02209029]
 [0.12403265 0.03951028 0.03992845 0.03924685 0.03970276 0.0400059
  0.0406236  0.03964615 0.03902758 0.03725397 0.03934574 0.03891123
  0.0388802  0.03293217 0.03470363 0.03491499 0.03441461 0.0348405
  0.02397819 0.02360242 0.02355889 0.02431108 0.02382824 0.02172444]]

K=4 — Cluster sizes:
Label_k4
3    419
0    385
2    340
1    251
Cluster centers:
[[0.12475193 0.03456616 0.04525169 0.04834236 0.04416879 0.04045806
  0.04649205 0.04286668 0.04141372 0.03585599 0.0441803  0.02011582
  0.02610384 0.03451458 0.03466846 0.03459754 0.03477261 0.03462376
  0.02357901 0.0232828  0.02423041 0.02519554 0.02337283 0.02135375]
 [0.12261319 0.03503326 0.03474136 0.03902485 0.03702257 0.03325564
  0.03667793 0.03301853 0.03418908 0.04024362 0.03510588 0.03514486
  0.03434508 0.11401695 0.03395053 0.03432303 0.03448507 0.03378919
  0.02342552 0.02254772 0.02354534 0.02290433 0.02291795 0.02135157]
 [0.12290646 0.04394966 0.03981077 0.03435581 0.03622954 0.0397973
  0.03830612 0.03666668 0.03718378 0.04031617 0.03883797 0.06335556
  0.0229624  0.03524513 0.03508346 0.03502071 0.03448158 0.03463424
  0.02476704 0.0224118  0.02257063 0.02277444 0.02463947 0.02237666]
 [0.12449366 0.03923871 0.03491691 0.03318156 0.03736479 0.03946838
  0.03574799 0.03853783 0.03797933 0.03735363 0.03423678 0.03573138
  0.06447419 0.03838267 0.03431744 0.03477724 0.03398651 0.0347456
  0.023979   0.02467179 0.02341951 0.02479704 0.02333815 0.02205866]]
No description has been provided for this image
[Saved: Flashion_Clusters.png]

Sample estimates (first 10):
 Item#  Total sales Stockout_time  Cluster_k2  Lost_Pct_k2  Est_Demand_k2  Cluster_k4  Lost_Pct_k4  Est_Demand_k4
     5         3115       hour 20         0.0     0.091444    3428.518574         1.0     0.090719    3425.784368
     6         4404       hour 22         1.0     0.045553    4614.188674         2.0     0.047016    4621.274424
    12         4943       hour 20         1.0     0.093423    5452.375402         3.0     0.093613    5453.522549
    13         6916       hour 21         1.0     0.069864    7435.469929         2.0     0.069791    7434.884847
    14         4783       hour 21         1.0     0.069864    5142.257471         3.0     0.070194    5144.082995
    17         6294       hour 22         0.0     0.044947    6590.207859         1.0     0.044270    6585.538594
    19         4716       hour 21         0.0     0.068292    5061.668992         3.0     0.070194    5072.024965
    26         2699       hour 22         0.0     0.044947    2826.020180         1.0     0.044270    2824.017901
    27         3267       hour 24         0.0     0.000000    3267.000000         1.0     0.000000    3267.000000
    32         4482       hour 23         1.0     0.021724    4581.531194         3.0     0.022059    4583.096977

Mean estimated demand  k=2: 10602
Mean estimated demand  k=4: 10662
No description has been provided for this image
[Saved: Flashion_EstVsActual.png]
[Saved: stockout_demand_estimates.csv]

Flashion Pattern Discussion:

For k=2:
  • Cluster 1 — "Early-rush": High fraction sold in hour 1, sharp drop after.
    These items attract buyers immediately when the event opens.
  • Cluster 2 — "Mid-event": Spike around hours 13-14, suggesting items
    discovered during midday browsing.
  • Both clusters converge to near-zero fractions after hour 18.

For k=4:
  • Four selling archetypes emerge: early-peak, mid-peak, late-peak, and
    broadly distributed demand across all 24 hours.
  • More granular cluster matching leads to more precise lost-fraction
    estimates per stockout item.

k=2 vs k=4 convergence:
  • Mean estimated demands are very close across both k values (within 1-2%),
    confirming robustness of the clustering approach.
  • The uplift (Est_Demand / Total_sales) is typically 5-15%, meaning
    stockout items could have sold significantly more with sufficient inventory.
  • k=4 is preferred in practice as it captures more distinct patterns
    without overcomplicating the model.

In [ ]: