# bifitnex-trading/backtest/whale_correlation.py

#!/usr/bin/env python3
"""Fetch blockchain.com on-chain metrics and analyze correlation with BTC price changes.

Whale proxy metrics (all free, daily granularity):
- estimated-transaction-volume (BTC): total estimated tx volume
- n-transactions: daily confirmed transaction count
- Derived: avg_tx_size_btc / avg_tx_size_usd = volume / n-transactions (whale activity proxy)
- output-volume (BTC): total output value
- estimated-transaction-volume-usd (USD): total estimated tx volume
- n-unique-addresses: unique addresses per day

Correlation targets:
- BTC next-day return
- BTC next-3-day return
- BTC next-5-day return
"""

import os
import sys
import time

import pandas as pd
import numpy as np
import requests

# Directory that contains this script; the paths below are built relative to it.
_HERE = os.path.dirname(__file__)

# Put the parent directory of this script at the front of the import path.
sys.path.insert(0, os.path.join(_HERE, ".."))

# Cache folder, and the hourly BTC candle file read by load_btc_daily_prices().
CACHE_DIR = os.path.join(_HERE, "..", "cache", "backtest")
BTC_1H_CACHE = os.path.join(CACHE_DIR, "tBTCUST_1h.csv")

# Base URL of the charts endpoint; the chart name is appended as a path segment.
BLOCKCHAIN_API = "https://api.blockchain.info/charts"

# Chart names fetched by main(), in fetch order.
METRICS = [
    "estimated-transaction-volume",      # BTC total est. tx volume
    "estimated-transaction-volume-usd",  # USD total est. tx volume
    "n-transactions",                    # daily confirmed tx count
    "output-volume",                     # total output value (BTC)
    "n-unique-addresses",                # unique addresses per day
]


def fetch_blockchain_metric(name: str, start: str, end: str) -> pd.DataFrame:
    """Fetch a single blockchain.com chart metric for a date range.

    Args:
        name: Chart name appended to ``BLOCKCHAIN_API`` (e.g. ``"n-transactions"``).
        start: First date to include; any value ``pd.Timestamp`` accepts.
        end: Last date to include (inclusive); any value ``pd.Timestamp`` accepts.

    Returns:
        DataFrame with two columns, ``date`` (``datetime.date``) and ``name``,
        restricted to ``[start, end]``. When the response carries no values the
        frame is empty but still has both columns, so callers can merge on
        ``date`` without a ``KeyError``.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    start_stamp = pd.Timestamp(start)
    end_stamp = pd.Timestamp(end)

    # Request a window sized to the requested range instead of a fixed
    # "1year", which silently truncated any range longer than a year.
    # The window is anchored at `start`; at least one day is always requested.
    span_days = max((end_stamp.normalize() - start_stamp.normalize()).days + 1, 1)

    url = f"{BLOCKCHAIN_API}/{name}"
    params = {
        "format": "json",
        "start": int(start_stamp.timestamp()),
        "timespan": f"{span_days}days",
    }
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    values = data.get("values", [])
    if not values:
        # Keep the schema stable even when there is nothing to return.
        return pd.DataFrame(columns=["date", name])

    df = pd.DataFrame(values)
    df.columns = ["timestamp", name]
    # Timestamps are epoch seconds; only the calendar date is kept.
    df["date"] = pd.to_datetime(df["timestamp"], unit="s").dt.date
    df = df[["date", name]]

    # Filter to requested range (both bounds inclusive)
    start_date = start_stamp.date()
    end_date = end_stamp.date()
    in_range = (df["date"] >= start_date) & (df["date"] <= end_date)
    return df[in_range]


def load_btc_daily_prices() -> pd.DataFrame:
    """Load the BTC 1h candle cache and aggregate it into daily OHLCV bars.

    Reads ``BTC_1H_CACHE`` and groups the hourly rows by calendar date of the
    ``timestamp`` column.

    Returns:
        DataFrame with one row per date and the columns ``date``, ``open``,
        ``high``, ``low``, ``close`` and ``volume``.

    Exits the process with status 1 (after printing a hint) when the cache
    file does not exist.
    """
    if not os.path.exists(BTC_1H_CACHE):
        print(f"ERROR: BTC 1h cache not found at {BTC_1H_CACHE}")
        print("Run backtest first to populate the cache.")
        sys.exit(1)

    hourly = pd.read_csv(BTC_1H_CACHE, parse_dates=["timestamp"])

    # "first"/"last" below pick the open/close by row position, so the rows
    # must be in chronological order. Sort explicitly rather than trusting the
    # order in which the cache was written; a stable sort keeps the file order
    # for rows that share a timestamp.
    hourly = hourly.sort_values("timestamp", kind="stable")

    hourly["date"] = hourly["timestamp"].dt.date
    daily = hourly.groupby("date").agg(
        open=("open", "first"),
        high=("high", "max"),
        low=("low", "min"),
        close=("close", "last"),
        volume=("volume", "sum"),
    ).reset_index()
    return daily


# Forward-return columns used as correlation targets, in display order.
_TARGET_COLS = ["ret_1d", "ret_3d", "ret_5d"]
_RULE = "=" * 70


def _fetch_all_metrics(start: str, end: str) -> list:
    """Fetch every chart in METRICS, keeping only the non-empty results."""
    frames = []
    for metric in METRICS:
        print(f"  {metric}...", end=" ", flush=True)
        try:
            frame = fetch_blockchain_metric(metric, start, end)
        except Exception as exc:  # best effort: one failed chart must not abort the run
            print(f"FAILED: {exc}")
        else:
            print(f"{len(frame)} days")
            # An empty frame has nothing to merge (and may lack a "date"
            # column), so it is skipped instead of being joined in.
            if not frame.empty:
                frames.append(frame)
        # Pause between requests.
        # NOTE(review): the previous comment cited a limit of 1 request per
        # 10 s while sleeping only 2 s - confirm the intended pause.
        time.sleep(2)
    return frames


def _combine_metrics(frames: list) -> pd.DataFrame:
    """Outer-join the per-metric frames on date and add average-tx-size columns."""
    onchain = frames[0]
    for frame in frames[1:]:
        onchain = onchain.merge(frame, on="date", how="outer")
    onchain = onchain.sort_values("date").reset_index(drop=True)

    if "n-transactions" in onchain.columns:
        # A zero count would yield +/-inf; treat it as missing instead.
        tx_count = onchain["n-transactions"].replace(0, np.nan)
        if "estimated-transaction-volume" in onchain.columns:
            onchain["avg_tx_size_btc"] = onchain["estimated-transaction-volume"] / tx_count
        if "estimated-transaction-volume-usd" in onchain.columns:
            onchain["avg_tx_size_usd"] = onchain["estimated-transaction-volume-usd"] / tx_count
    return onchain


def _add_returns_and_zscores(merged: pd.DataFrame) -> pd.DataFrame:
    """Add forward BTC returns and a rolling 30-row z-score per on-chain column."""
    close = merged["btc_close"]
    # pct_change(k) looks k rows back; shifting by -k turns it into the
    # return over the NEXT k rows as seen from the current row.
    merged["ret_1d"] = close.pct_change().shift(-1)   # next-day return
    merged["ret_3d"] = close.pct_change(3).shift(-3)  # next-3-day return
    merged["ret_5d"] = close.pct_change(5).shift(-5)  # next-5-day return

    excluded = {"date", "btc_close", "btc_volume", *_TARGET_COLS}
    onchain_cols = [c for c in merged.columns if c not in excluded]
    for col in onchain_cols:
        window = merged[col].rolling(30, min_periods=10)
        # A flat window has zero std; use NaN rather than dividing by zero.
        roll_std = window.std().replace(0, np.nan)
        merged[f"{col}_zscore"] = (merged[col] - window.mean()) / roll_std
    return merged


def _report_correlations(merged: pd.DataFrame) -> pd.DataFrame:
    """Print the Pearson correlation table and return it in long form."""
    zscore_cols = [c for c in merged.columns if c.endswith("_zscore")]

    print("\n" + _RULE)
    print("  PEARSON CORRELATION: On-Chain Metrics ↔ BTC Forward Returns")
    print(_RULE)

    # Only rows where every z-score and every target is present are used.
    valid = merged.dropna(subset=_TARGET_COLS + zscore_cols)
    print(f"  (Using {len(valid)} complete observations)\n")

    results = []
    for oc_col in zscore_cols:
        for target in _TARGET_COLS:
            corr = valid[oc_col].corr(valid[target])
            results.append({"metric": oc_col, "target": target, "corr": corr})
    results_df = pd.DataFrame(results)

    pivot = results_df.pivot(index="metric", columns="target", values="corr")
    pivot = pivot[_TARGET_COLS]  # order columns

    # Sort by absolute correlation with ret_1d
    pivot["abs_ret_1d"] = pivot["ret_1d"].abs()
    pivot = pivot.sort_values("abs_ret_1d", ascending=False)
    pivot = pivot.drop(columns="abs_ret_1d")

    # Header first, with the same 8-character column width as the values.
    header = "  ".join(f"{t:>8s}" for t in _TARGET_COLS)
    print(f"  {'':35s} {header}")
    for metric in pivot.index:
        name = metric.replace("_zscore", "")
        vals = "  ".join(f"{pivot.loc[metric, t]:>+8.4f}" for t in _TARGET_COLS)
        print(f"  {name:<35s} {vals}")

    return results_df


def _report_notable(results_df: pd.DataFrame) -> None:
    """Print every metric/target pair whose |r| exceeds 0.10, strongest first."""
    print("\n" + _RULE)
    print("  NOTABLE CORRELATIONS (|r| > 0.10)")
    print(_RULE)

    notable = results_df[results_df["corr"].abs() > 0.10].sort_values(
        "corr", key=abs, ascending=False
    )
    if notable.empty:
        print("  None found — on-chain metrics show weak correlation with BTC returns.")
        return

    for _, row in notable.iterrows():
        direction = "↑↑" if row["corr"] > 0 else "↓↑" if row["corr"] < 0 else "  "
        name = row["metric"].replace("_zscore", "")
        print(f"  {direction} {name:<35s} → {row['target']}: r={row['corr']:+.4f}")


def _report_extremes(merged: pd.DataFrame) -> None:
    """Print mean next-day return for the top and bottom z-score deciles."""
    print("\n" + _RULE)
    print("  EXTREME VALUE ANALYSIS (Top/Bottom 10% Days)")
    print(_RULE)

    for col_name in ["avg_tx_size_btc", "estimated-transaction-volume", "avg_tx_size_usd"]:
        zscore_col = f"{col_name}_zscore"
        if zscore_col not in merged.columns:
            continue

        valid_ext = merged.dropna(subset=[zscore_col, "ret_1d"])
        if len(valid_ext) < 20:
            # Too few rows for the deciles to be meaningful.
            continue

        q10 = valid_ext[zscore_col].quantile(0.10)
        q90 = valid_ext[zscore_col].quantile(0.90)

        low_days = valid_ext[valid_ext[zscore_col] <= q10]
        high_days = valid_ext[valid_ext[zscore_col] >= q90]
        all_avg = valid_ext["ret_1d"].mean()

        print(f"\n  {col_name}:")
        print(f"    Low activity days (bottom 10%):  avg next-day ret = {low_days['ret_1d'].mean():+.4f}  (n={len(low_days)})")
        print(f"    High activity days (top 10%):    avg next-day ret = {high_days['ret_1d'].mean():+.4f}  (n={len(high_days)})")
        print(f"    All days average:                avg next-day ret = {all_avg:+.4f}  (n={len(valid_ext)})")


def main(start: str = "2025-07-01", end: str = "2026-03-17") -> None:
    """Run the whale-activity vs. BTC-return correlation analysis.

    Fetches the charts listed in ``METRICS``, joins them with daily BTC prices
    from the local cache, prints correlation and extreme-value reports, and
    writes the merged dataset to ``whale_correlation_data.csv`` in
    ``CACHE_DIR``.

    Args:
        start: First date of the analysis window (inclusive).
        end: Last date of the analysis window (inclusive).
    """
    print("=== Whale Activity ↔ BTC Price Correlation Analysis ===\n")

    # Step 1: Fetch on-chain metrics
    print("Fetching blockchain.com metrics...")
    frames = _fetch_all_metrics(start, end)
    if not frames:
        print("ERROR: No metrics fetched")
        return

    onchain = _combine_metrics(frames)
    print(f"\nOn-chain data: {len(onchain)} days")

    # Step 2: Load BTC prices
    print("Loading BTC daily prices from cache...")
    btc = load_btc_daily_prices()
    print(f"BTC daily data: {len(btc)} days")

    # Step 3: Merge and compute returns
    merged = onchain.merge(btc[["date", "close", "volume"]], on="date", how="inner")
    merged = merged.rename(columns={"close": "btc_close", "volume": "btc_volume"})
    merged = merged.sort_values("date").reset_index(drop=True)
    if merged.empty:
        # Without this guard the date-range print below raises IndexError.
        print("ERROR: On-chain data and BTC prices share no dates")
        return

    merged = _add_returns_and_zscores(merged)

    # Step 4: Correlation analysis
    print(f"\nMerged dataset: {len(merged)} days")
    print(f"Date range: {merged['date'].iloc[0]} to {merged['date'].iloc[-1]}")
    results_df = _report_correlations(merged)

    # Step 5: Highlight significant correlations
    _report_notable(results_df)

    # Step 6: Extreme value analysis (whale spikes)
    _report_extremes(merged)

    # Save merged data for further analysis
    out_path = os.path.join(CACHE_DIR, "whale_correlation_data.csv")
    merged.to_csv(out_path, index=False)
    print(f"\nSaved merged dataset to {out_path}")
    print("Done.")


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()