add engram-lite, add log, tune scaling laws analysis scripts

2026-01-27 22:31:17 +00:00
parent 59e36cc727
commit c8d93beed2
5 changed files with 346 additions and 35 deletions
@@ -15,14 +15,16 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "%matplotlib inline\n",
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Load results\n",
+    "tag = \"jan26\"\n",
    "base_dir = os.environ.get('NANOCHAT_BASE_DIR', os.path.expanduser('~/.cache/nanochat'))\n",
-    "results_path = os.path.join(base_dir, 'scaling_laws_results', 'results.csv')\n",
+    "results_path = os.path.join(base_dir, f'scaling_laws_results_{tag}', 'results.csv')\n",
    "\n",
    "df = pd.read_csv(results_path)\n",
    "flops_budgets = sorted(df['flops_budget'].unique())\n",
@@ -31,6 +33,99 @@
    "df"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# FILTERING: Remove incomplete or problematic runs\n",
+    "# =============================================================================\n",
+    "\n",
+    "print(f\"Before filtering: {len(df)} runs\")\n",
+    "\n",
+    "# Filter out runs with missing/invalid val_bpb (incomplete runs).\n",
+    "# .copy() makes the result an independent frame, so the column assignments in\n",
+    "# later cells (e.g. df['effective_params'] = ...) do not trigger pandas'\n",
+    "# SettingWithCopyWarning or write into a selection of the loaded frame.\n",
+    "df = df[df['val_bpb'].notna() & (df['val_bpb'] > 0)].copy()\n",
+    "\n",
+    "# Optional: exclude specific flops budgets that aren't done yet\n",
+    "# exclude_flops = [1e19]  # <-- adjust as runs complete\n",
+    "# df = df[~df['flops_budget'].isin(exclude_flops)]\n",
+    "\n",
+    "# Optional: exclude specific depths\n",
+    "# exclude_depths = [18, 20]\n",
+    "# df = df[~df['depth'].isin(exclude_depths)]\n",
+    "\n",
+    "print(f\"After filtering: {len(df)} runs\")\n",
+    "print(f\"FLOPs budgets: {sorted(df['flops_budget'].unique())}\")\n",
+    "print(f\"Depths: {sorted(df['depth'].unique())}\")\n",
+    "\n",
+    "# Update flops_budgets list after filtering, so later cells that iterate over\n",
+    "# it only see budgets that still have at least one valid run\n",
+    "flops_budgets = sorted(df['flops_budget'].unique())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Effective Parameter Count\n",
+    "\n",
+    "Different scaling law papers use different conventions for counting parameters:\n",
+    "- **Kaplan et al.** excluded embedding parameters (claimed cleaner laws)\n",
+    "- **Chinchilla** included all parameters (and noted Kaplan had a bug)\n",
+    "\n",
+    "Our CSV now has granular counts:\n",
+    "- `params_wte` - token embedding (lookup table)\n",
+    "- `params_bigram_embed` - bigram hash embeddings (lookup table)\n",
+    "- `params_value_embeds` - value embeddings (lookup table)\n",
+    "- `params_lm_head` - unembedding projection (matmul)\n",
+    "- `params_transformer` - attention + MLP matrices (matmuls)\n",
+    "- `params_scalars` - resid/x0/bigram lambdas (tiny)\n",
+    "\n",
+    "**Experiment below** with different combinations to see which gives the cleanest scaling laws."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# EXPERIMENT HERE: Define which parameters to count for scaling laws\n",
+    "# =============================================================================\n",
+    "\n",
+    "def compute_effective_params(row):\n",
+    "    \"\"\"\n",
+    "    Compute the 'effective' parameter count for scaling law analysis.\n",
+    "\n",
+    "    Modify this function to experiment with different conventions:\n",
+    "    - Chinchilla-style: include everything\n",
+    "    - Kaplan-style: exclude embeddings\n",
+    "    - Matmul-only: just transformer + lm_head (the actual compute)\n",
+    "    - etc.\n",
+    "    \"\"\"\n",
+    "    # Option 1: Chinchilla-style (all params)\n",
+    "    # return row['params_total']\n",
+    "\n",
+    "    # Option 2: Matmul-only (transformer + lm_head; excludes embedding lookup tables)\n",
+    "    return row['params_transformer'] + row['params_lm_head']\n",
+    "\n",
+    "    # Option 3: Transformer-only (exclude all embeddings AND lm_head)\n",
+    "    # return row['params_transformer']\n",
+    "\n",
+    "\n",
+    "# Compute derived columns\n",
+    "df['effective_params'] = df.apply(compute_effective_params, axis=1)\n",
+    "df['param_data_ratio'] = df['tokens_trained'] / df['effective_params']\n",
+    "\n",
+    "# Show parameter breakdown for the first row of each flops budget\n",
+    "print(\"Parameter breakdown (first row per flops budget):\")\n",
+    "param_cols = ['depth', 'params_wte', 'params_bigram_embed', 'params_value_embeds',\n",
+    "              'params_lm_head', 'params_transformer', 'params_scalars', 'params_total', 'effective_params']\n",
+    "df.groupby('flops_budget').first()[param_cols]"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -54,11 +149,11 @@
    "optimal_by_bpb = []\n",
    "\n",
    "for flops, color in zip(flops_budgets, colors):\n",
-    "    subset = df[df['flops_budget'] == flops].sort_values('num_scaling_params')\n",
-    "    ax.plot(subset['num_scaling_params'], subset['val_bpb'], 'o', color=color, label=f'{flops:.0e}', markersize=8)\n",
+    "    subset = df[df['flops_budget'] == flops].sort_values('effective_params')\n",
+    "    ax.plot(subset['effective_params'], subset['val_bpb'], 'o', color=color, label=f'{flops:.0e}', markersize=8)\n",
    "\n",
    "    # Fit quadratic in log-space: val_bpb = a*(log N)^2 + b*(log N) + c\n",
-    "    log_params = np.log10(subset['num_scaling_params'])\n",
+    "    log_params = np.log10(subset['effective_params'])\n",
    "    coeffs = np.polyfit(log_params, subset['val_bpb'], 2)\n",
    "    a, b, c = coeffs\n",
    "\n",
@@ -83,13 +178,13 @@
    "        # Fallback to raw minimum if quadratic doesn't have minimum\n",
    "        best_idx = subset['val_bpb'].idxmin()\n",
    "        best = subset.loc[best_idx]\n",
-    "        ax.scatter([best['num_scaling_params']], [best['val_bpb']], s=150, color=color,\n",
+    "        ax.scatter([best['effective_params']], [best['val_bpb']], s=150, color=color,\n",
    "                   zorder=5, edgecolors='black', linewidths=2)\n",
-    "        optimal_by_bpb.append({'flops': flops, 'params': best['num_scaling_params'],\n",
+    "        optimal_by_bpb.append({'flops': flops, 'params': best['effective_params'],\n",
    "                              'tokens': best['tokens_trained'], 'ratio': best['param_data_ratio'], 'bpb': best['val_bpb']})\n",
    "\n",
    "ax.set_xscale('log')\n",
-    "ax.set_xlabel('Parameters')\n",
+    "ax.set_xlabel('Effective Parameters')\n",
    "ax.set_ylabel('Validation Loss (bpb)')\n",
    "ax.set_title('IsoFLOP Curves')\n",
    "ax.legend(title='FLOPs', loc='upper right')\n",
@@ -138,10 +233,61 @@
    "\n",
    "# Print the optimal points (from quadratic fits)\n",
    "print(\"\\nOptimal configurations (from quadratic fits):\")\n",
-    "print(f\"{'FLOPs':<12} {'Params':<15} {'Tokens':<15} {'Ratio':<10} {'Val BPB':<10}\")\n",
+    "print(f\"{'FLOPs':<12} {'Eff Params':<15} {'Tokens':<15} {'Ratio':<10} {'Val BPB':<10}\")\n",
    "print(\"-\" * 65)\n",
    "for _, row in opt_df.iterrows():\n",
-    "    print(f\"{row['flops']:<12.0e} {int(row['params']):<15,} {int(row['tokens']):<15,} {row['ratio']:<10.1f} {row['bpb']:<10.4f}\")\n"
+    "    print(f\"{row['flops']:<12.0e} {int(row['params']):<15,} {int(row['tokens']):<15,} {row['ratio']:<10.1f} {row['bpb']:<10.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# =============================================================================\n",
+    "# Optimal Ratio Summary (from power law fits)\n",
+    "# =============================================================================\n",
+    "\n",
+    "# From the power law fits: N ∝ C^a and D ∝ C^b\n",
+    "# The ratio D/N ∝ C^(b-a). If a ≈ b, ratio is roughly constant.\n",
+    "\n",
+    "if len(opt_df) >= 2:\n",
+    "    log_f = np.log10(opt_df['flops'])\n",
+    "    log_p = np.log10(opt_df['params'])\n",
+    "    log_t = np.log10(opt_df['tokens'])\n",
+    "\n",
+    "    # Fit power laws\n",
+    "    slope_n, intercept_n = np.polyfit(log_f, log_p, 1)\n",
+    "    slope_d, intercept_d = np.polyfit(log_f, log_t, 1)\n",
+    "\n",
+    "    # The ratio D/N at a reference compute (geometric mean of our budgets)\n",
+    "    ref_flops = np.sqrt(opt_df['flops'].min() * opt_df['flops'].max())\n",
+    "    log_ref = np.log10(ref_flops)\n",
+    "\n",
+    "    # Predicted optimal N and D at reference compute\n",
+    "    pred_log_n = intercept_n + slope_n * log_ref\n",
+    "    pred_log_d = intercept_d + slope_d * log_ref\n",
+    "    optimal_ratio = 10**(pred_log_d - pred_log_n)\n",
+    "\n",
+    "    # Also compute from the fitted optimals directly (mean and std)\n",
+    "    mean_ratio = opt_df['ratio'].mean()\n",
+    "    std_ratio = opt_df['ratio'].std()\n",
+    "\n",
+    "    print(\"=\" * 60)\n",
+    "    print(\"OPTIMAL RATIO SUMMARY\")\n",
+    "    print(\"=\" * 60)\n",
+    "    print(f\"\\nPower law exponents:\")\n",
+    "    print(f\"  N ∝ C^{slope_n:.3f}\")\n",
+    "    print(f\"  D ∝ C^{slope_d:.3f}\")\n",
+    "    print(f\"  Ratio exponent (b-a): {slope_d - slope_n:.3f}  (should be ~0 if ratio is constant)\")\n",
+    "    print(f\"\\nOptimal ratio (tokens per effective param):\")\n",
+    "    print(f\"  From power law at C={ref_flops:.1e}: {optimal_ratio:.1f}\")\n",
+    "    print(f\"  Mean across budgets: {mean_ratio:.1f} ± {std_ratio:.1f}\")\n",
+    "    print(f\"  Chinchilla reference: 20\")\n",
+    "    print(f\"\\nPer-budget ratios: {[f'{r:.1f}' for r in opt_df['ratio'].values]}\")\n",
+    "else:\n",
+    "    print(\"Need at least 2 flops budgets to compute power law fits\")"
   ]
  },
  {