#!/usr/bin/env python3
"""Fetch blockchain.com on-chain metrics and analyze correlation with BTC price changes.

Whale proxy metrics (all free, daily granularity):
- estimated-transaction-volume (BTC): total estimated tx volume
- n-transactions: daily confirmed transaction count
- Derived: avg_tx_size = volume / n_transactions (whale activity proxy)
- output-volume (BTC): total output value

Correlation targets:
- BTC next-day return
- BTC next-3-day return
- BTC next-5-day return
"""

import os
import sys
import time

import pandas as pd
import numpy as np
import requests

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

CACHE_DIR = os.path.join(os.path.dirname(__file__), "..", "cache", "backtest")
BTC_1H_CACHE = os.path.join(CACHE_DIR, "tBTCUST_1h.csv")

BLOCKCHAIN_API = "https://api.blockchain.info/charts"

METRICS = [
    "estimated-transaction-volume",      # BTC total est. tx volume
    "estimated-transaction-volume-usd",  # USD total est. tx volume
    "n-transactions",                    # daily confirmed tx count
    "output-volume",                     # total output value (BTC)
    "n-unique-addresses",                # unique addresses per day
]

# Default analysis window used when main() is called without arguments.
DEFAULT_START = "2025-07-01"
DEFAULT_END = "2026-03-17"

# Forward-return horizons (in rows of the merged daily table) and their columns.
RETURN_HORIZONS = (1, 3, 5)
TARGET_COLS = [f"ret_{h}d" for h in RETURN_HORIZONS]

# Seconds to wait between two consecutive chart requests.
REQUEST_PAUSE_SEC = 2


def fetch_blockchain_metric(name: str, start: str, end: str) -> pd.DataFrame:
    """Fetch a single blockchain.com chart metric.

    Args:
        name: Chart name appended to ``BLOCKCHAIN_API`` (e.g. ``"n-transactions"``).
        start: First date to keep (anything ``pd.Timestamp`` accepts).
        end: Last date to keep, inclusive.

    Returns:
        DataFrame with a ``date`` column (``datetime.date`` objects) and a
        column named ``name``, restricted to ``start <= date <= end``.  When
        the API returns no values, or ``end`` precedes ``start``, the frame is
        empty but still carries both columns.

    Raises:
        requests.RequestException: On network errors or a non-2xx response.
    """
    start_date = pd.Timestamp(start).date()
    end_date = pd.Timestamp(end).date()
    empty = pd.DataFrame(columns=["date", name])
    if end_date < start_date:
        return empty

    # The chart window begins at `start` and lasts `timespan`.  Size the window
    # from the requested range (rounded up to whole years) instead of assuming
    # one year is always enough; any surplus is trimmed by the date filter.
    n_days = (end_date - start_date).days + 1
    n_years = (n_days + 364) // 365
    timespan = "1year" if n_years == 1 else f"{n_years}years"

    params = {
        "format": "json",
        "start": int(pd.Timestamp(start).timestamp()),
        "timespan": timespan,
        # Ask for every data point; by default the API may down-sample long
        # ranges, which would leave holes in a daily series.
        "sampled": "false",
    }
    resp = requests.get(f"{BLOCKCHAIN_API}/{name}", params=params, timeout=30)
    resp.raise_for_status()

    values = resp.json().get("values", [])
    if not values:
        return empty

    # Each point is {"x": unix_seconds, "y": value}; select the keys by name
    # rather than relying on their order in the payload.
    points = pd.DataFrame(values, columns=["x", "y"])
    df = pd.DataFrame(
        {
            "date": pd.to_datetime(points["x"], unit="s").dt.date,
            name: points["y"],
        }
    )

    in_range = (df["date"] >= start_date) & (df["date"] <= end_date)
    # One row per date keeps the later merges on "date" from multiplying rows.
    return df[in_range].drop_duplicates(subset="date", keep="last")


def load_btc_daily_prices() -> pd.DataFrame:
    """Load BTC 1h cache and resample to daily OHLC.

    Reads ``BTC_1H_CACHE`` and aggregates the hourly rows per calendar date.

    Returns:
        DataFrame with columns ``date``, ``open``, ``high``, ``low``,
        ``close`` and ``volume``, one row per date.

    Exits the process with status 1 when the cache file does not exist.
    """
    if not os.path.exists(BTC_1H_CACHE):
        print(f"ERROR: BTC 1h cache not found at {BTC_1H_CACHE}")
        print("Run backtest first to populate the cache.")
        sys.exit(1)

    df = pd.read_csv(BTC_1H_CACHE, parse_dates=["timestamp"])
    # "first"/"last" below pick open/close by row position, so the rows must
    # be in chronological order regardless of how the cache was written.
    df = df.sort_values("timestamp", kind="stable")
    df["date"] = df["timestamp"].dt.date

    daily = df.groupby("date").agg(
        open=("open", "first"),
        high=("high", "max"),
        low=("low", "min"),
        close=("close", "last"),
        volume=("volume", "sum"),
    ).reset_index()
    return daily


def _fetch_all_metrics(start: str, end: str) -> list:
    """Fetch every chart in METRICS, returning only the non-empty frames."""
    frames = []
    for i, metric in enumerate(METRICS):
        if i:
            # Pause between requests to stay gentle on the public API.
            time.sleep(REQUEST_PAUSE_SEC)
        print(f" {metric}...", end=" ", flush=True)
        try:
            df = fetch_blockchain_metric(metric, start, end)
        except Exception as e:
            # Deliberately best-effort: one failing chart must not abort the run.
            print(f"FAILED: {e}")
            continue
        print(f"{len(df)} days")
        if not df.empty:
            frames.append(df)
    return frames


def _merge_onchain_metrics(frames: list) -> pd.DataFrame:
    """Outer-join the per-metric frames on date and add average-tx-size columns."""
    onchain = frames[0]
    for df in frames[1:]:
        onchain = onchain.merge(df, on="date", how="outer")
    onchain = onchain.sort_values("date").reset_index(drop=True)

    if "n-transactions" in onchain.columns:
        # A zero count would produce +/-inf; treat it as missing instead.
        tx_count = onchain["n-transactions"].replace(0, np.nan)
        derived = (
            ("estimated-transaction-volume", "avg_tx_size_btc"),
            ("estimated-transaction-volume-usd", "avg_tx_size_usd"),
        )
        for volume_col, out_col in derived:
            if volume_col in onchain.columns:
                onchain[out_col] = onchain[volume_col] / tx_count
    return onchain


def _add_forward_returns(merged: pd.DataFrame) -> None:
    """Add ret_<h>d columns: return from this row's close to the close h rows later."""
    close = merged["btc_close"]
    for horizon, col in zip(RETURN_HORIZONS, TARGET_COLS):
        # Horizons count rows of `merged`; they equal calendar days only when
        # the merged table has no missing dates.
        merged[col] = close.shift(-horizon) / close - 1


def _add_rolling_zscores(merged: pd.DataFrame, cols: list,
                         window: int = 30, min_periods: int = 10) -> list:
    """Add a rolling z-score column per input column and return the new column names."""
    zscore_cols = []
    for col in cols:
        rolling = merged[col].rolling(window, min_periods=min_periods)
        # A zero standard deviation would divide by zero; mark it as missing.
        roll_std = rolling.std().replace(0, np.nan)
        merged[f"{col}_zscore"] = (merged[col] - rolling.mean()) / roll_std
        zscore_cols.append(f"{col}_zscore")
    return zscore_cols


def _print_correlation_report(merged: pd.DataFrame, zscore_cols: list) -> None:
    """Print the Pearson correlation table and the list of notable correlations."""
    print("\n" + "=" * 70)
    print(" PEARSON CORRELATION: On-Chain Metrics ↔ BTC Forward Returns")
    print("=" * 70)

    valid = merged.dropna(subset=TARGET_COLS + zscore_cols)
    print(f" (Using {len(valid)} complete observations)\n")
    if len(valid) < 2:
        print(" Not enough complete observations to compute correlations.")
        return

    results_df = pd.DataFrame(
        [
            {"metric": oc_col, "target": target,
             "corr": valid[oc_col].corr(valid[target])}
            for oc_col in zscore_cols
            for target in TARGET_COLS
        ]
    )

    pivot = results_df.pivot(index="metric", columns="target", values="corr")
    pivot = pivot[TARGET_COLS]  # order columns
    # Strongest next-day relationship first.
    order = pivot["ret_1d"].abs().sort_values(ascending=False).index
    pivot = pivot.reindex(order)

    # Header goes above the rows, using the same column width as the values.
    header = " ".join(f"{t:>8s}" for t in TARGET_COLS)
    print(f" {'':35s} {header}")
    for metric in pivot.index:
        name = metric.replace("_zscore", "")
        vals = " ".join(f"{pivot.loc[metric, t]:>+8.4f}" for t in TARGET_COLS)
        print(f" {name:<35s} {vals}")

    print("\n" + "=" * 70)
    print(" NOTABLE CORRELATIONS (|r| > 0.10)")
    print("=" * 70)

    notable = results_df[results_df["corr"].abs() > 0.10].sort_values(
        "corr", key=abs, ascending=False
    )
    if notable.empty:
        print(" None found — on-chain metrics show weak correlation with BTC returns.")
        return
    for _, row in notable.iterrows():
        direction = "↑↑" if row["corr"] > 0 else "↓↑" if row["corr"] < 0 else " "
        name = row["metric"].replace("_zscore", "")
        print(f" {direction} {name:<35s} → {row['target']}: r={row['corr']:+.4f}")


def _print_extreme_value_report(merged: pd.DataFrame) -> None:
    """Print the average next-day return on the top/bottom 10% z-score days."""
    print("\n" + "=" * 70)
    print(" EXTREME VALUE ANALYSIS (Top/Bottom 10% Days)")
    print("=" * 70)

    for col_name in ["avg_tx_size_btc", "estimated-transaction-volume", "avg_tx_size_usd"]:
        zscore_col = f"{col_name}_zscore"
        if zscore_col not in merged.columns:
            continue

        valid_ext = merged.dropna(subset=[zscore_col, "ret_1d"])
        if len(valid_ext) < 20:
            # Too few observations for decile buckets to be meaningful.
            continue

        q10 = valid_ext[zscore_col].quantile(0.10)
        q90 = valid_ext[zscore_col].quantile(0.90)
        low_days = valid_ext[valid_ext[zscore_col] <= q10]
        high_days = valid_ext[valid_ext[zscore_col] >= q90]
        all_avg = valid_ext["ret_1d"].mean()

        print(f"\n {col_name}:")
        print(f" Low activity days (bottom 10%): avg next-day ret = {low_days['ret_1d'].mean():+.4f} (n={len(low_days)})")
        print(f" High activity days (top 10%): avg next-day ret = {high_days['ret_1d'].mean():+.4f} (n={len(high_days)})")
        print(f" All days average: avg next-day ret = {all_avg:+.4f} (n={len(valid_ext)})")


def main(start: str = DEFAULT_START, end: str = DEFAULT_END) -> None:
    """Run the whale-activity vs. BTC-return correlation analysis.

    Fetches the charts listed in ``METRICS``, joins them with daily BTC prices
    from the local cache, prints correlation and extreme-value reports, and
    writes the merged table to ``whale_correlation_data.csv`` in ``CACHE_DIR``.

    Args:
        start: First date of the analysis window.
        end: Last date of the analysis window, inclusive.
    """
    print("=== Whale Activity ↔ BTC Price Correlation Analysis ===\n")

    # Step 1: Fetch on-chain metrics
    print("Fetching blockchain.com metrics...")
    metrics_dfs = _fetch_all_metrics(start, end)
    if not metrics_dfs:
        print("ERROR: No metrics fetched")
        return

    onchain = _merge_onchain_metrics(metrics_dfs)
    print(f"\nOn-chain data: {len(onchain)} days")

    # Step 2: Load BTC prices
    print("Loading BTC daily prices from cache...")
    btc = load_btc_daily_prices()
    print(f"BTC daily data: {len(btc)} days")

    # Step 3: Merge and compute returns
    merged = onchain.merge(btc[["date", "close", "volume"]], on="date", how="inner")
    merged = merged.rename(columns={"close": "btc_close", "volume": "btc_volume"})
    merged = merged.sort_values("date").reset_index(drop=True)
    if merged.empty:
        print("ERROR: No overlapping dates between on-chain metrics and BTC prices")
        return

    onchain_cols = [
        c for c in merged.columns if c not in ("date", "btc_close", "btc_volume")
    ]
    _add_forward_returns(merged)
    zscore_cols = _add_rolling_zscores(merged, onchain_cols)

    # Step 4: Correlation analysis
    print(f"\nMerged dataset: {len(merged)} days")
    print(f"Date range: {merged['date'].iloc[0]} to {merged['date'].iloc[-1]}")

    # Step 5: Correlation table and notable correlations
    _print_correlation_report(merged, zscore_cols)

    # Step 6: Extreme value analysis (whale spikes)
    _print_extreme_value_report(merged)

    # Save merged data for further analysis
    os.makedirs(CACHE_DIR, exist_ok=True)
    out_path = os.path.join(CACHE_DIR, "whale_correlation_data.csv")
    merged.to_csv(out_path, index=False)
    print(f"\nSaved merged dataset to {out_path}")
    print("Done.")


if __name__ == "__main__":
    main()