{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "# Day 18 - CUDA Libraries and Graphs\n\nCompanion notebook for cuBLAS/cuDNN/NCCL/CUDA Graphs. CUDA-specific cells skip cleanly without CUDA.\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": "# Standard library\nimport math\nimport time\n\n# Third-party\nimport numpy as np\n\n# Show the NumPy version so the runtime environment is visible in the output.\nprint('numpy', np.__version__)\n",
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 1. Launch Overhead Budget\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": "def launch_overhead_ms(layers, kernels_per_layer, launch_us):\n    \"\"\"Return total kernel-launch overhead in milliseconds.\n\n    Args:\n        layers: Number of layers.\n        kernels_per_layer: Kernel launches per layer.\n        launch_us: Cost of one launch in microseconds.\n\n    Returns:\n        ``layers * kernels_per_layer * launch_us`` converted from\n        microseconds to milliseconds.\n    \"\"\"\n    launch_count = layers * kernels_per_layer\n    return launch_count * launch_us / 1000  # us -> ms\n\n\n# Worked example: 32 layers, 8 launches per layer, 10 us per launch.\noverhead = launch_overhead_ms(layers=32, kernels_per_layer=8, launch_us=10)\n# 50 tokens per second leaves 1000 ms / 50 for each token.\nbudget_ms = 1000 / 50\n\nprint('overhead ms/token:', overhead)\nprint('50 tok/s budget ms/token:', budget_ms)\nprint('share:', overhead / budget_ms)\n\n# Sanity check on the worked example: 32 * 8 * 10 us = 2560 us = 2.56 ms.\nassert abs(overhead - 2.56) < 1e-9\n",
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 2. NCCL Ring All-Reduce Estimate\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": "def ring_allreduce_time_ms(message_gb, gpus, bandwidth_gb_s):\n    \"\"\"Estimate the bandwidth term of a ring all-reduce, in milliseconds.\n\n    Computes ``2 * (gpus - 1) / gpus * message_gb / bandwidth_gb_s`` seconds and\n    converts the result to milliseconds. Per-step latency is not modelled.\n\n    Args:\n        message_gb: Message size in GB (>= 0).\n        gpus: Number of GPUs in the ring (>= 1); a single GPU yields 0.\n        bandwidth_gb_s: Bandwidth in GB/s (> 0).\n\n    Returns:\n        Estimated time in milliseconds.\n\n    Raises:\n        ValueError: If an argument is outside the ranges above.\n    \"\"\"\n    # Reject inputs that would otherwise divide by zero or yield a meaningless time.\n    if gpus < 1:\n        raise ValueError(f'gpus must be >= 1, got {gpus!r}')\n    if bandwidth_gb_s <= 0:\n        raise ValueError(f'bandwidth_gb_s must be > 0, got {bandwidth_gb_s!r}')\n    if message_gb < 0:\n        raise ValueError(f'message_gb must be >= 0, got {message_gb!r}')\n    seconds = 2 * (gpus - 1) / gpus * message_gb / bandwidth_gb_s\n    return seconds * 1000\n\nprint('1GB on 8 GPUs at 600GB/s:', ring_allreduce_time_ms(1, 8, 600), 'ms')\n",
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 3. Optional PyTorch Matmul Path\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": "try:\n    import torch\n\n    def _pick_device():\n        \"\"\"Return 'cuda', 'mps' or 'cpu', in that order of preference.\"\"\"\n        if torch.cuda.is_available():\n            return 'cuda'\n        # Older PyTorch builds have no torch.backends.mps; treat that as 'no MPS'\n        # so an AttributeError does not skip a cell that can still run on CPU.\n        mps_backend = getattr(torch.backends, 'mps', None)\n        if mps_backend is not None and mps_backend.is_available():\n            return 'mps'\n        return 'cpu'\n\n    def _sync(device):\n        \"\"\"Wait for queued device work so the wall-clock timing covers the matmul.\"\"\"\n        if device == 'cuda':\n            torch.cuda.synchronize()\n        elif device == 'mps' and hasattr(torch, 'mps'):\n            # NOTE: on builds without torch.mps there is no explicit sync here,\n            # so the MPS timing below may under-report.\n            torch.mps.synchronize()\n\n    device = _pick_device()\n    print('torch', torch.__version__, 'device', device)\n    x = torch.randn(256, 256, device=device)\n    w = torch.randn(256, 256, device=device)\n\n    # Warm-up iterations so one-time setup cost is not included in the timing.\n    for _ in range(3):\n        y = x @ w\n    _sync(device)\n\n    t0 = time.perf_counter()\n    y = x @ w\n    _sync(device)\n    print('matmul ms:', (time.perf_counter() - t0) * 1000)\nexcept Exception as e:\n    # Deliberate best-effort: the torch path is optional, so report and move on.\n    print('Skipping torch:', e)\n",
      "outputs": [],
      "execution_count": null
    },
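    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 4. CUDA Graph Pseudocode\n\nDespite the heading, the next cell is runnable code rather than pseudocode: it warms up a matmul, captures it into a CUDA graph, and replays the graph. Capture only runs when a CUDA device is available; otherwise the cell prints a skip message.\n"
    },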
    {
      "cell_type": "code",
      "metadata": {},
      "source": "try:\n    import torch\n    if not torch.cuda.is_available():\n        print('CUDA unavailable; graph capture skipped.')\n    else:\n        device = 'cuda'\n        # Static buffers: the captured graph is replayed against these same tensors,\n        # so new inputs are written into x in place rather than rebinding the name.\n        x = torch.randn(1, 4096, device=device)\n        w = torch.randn(4096, 4096, device=device)\n        static_out = torch.empty(1, 4096, device=device)\n\n        # Warm up on a side stream before capture, following the pattern in the\n        # PyTorch CUDA Graphs notes.\n        side_stream = torch.cuda.Stream()\n        side_stream.wait_stream(torch.cuda.current_stream())\n        with torch.cuda.stream(side_stream):\n            for _ in range(3):\n                static_out.copy_(x @ w)\n        torch.cuda.current_stream().wait_stream(side_stream)\n        torch.cuda.synchronize()\n\n        # Capture one matmul into the graph.\n        g = torch.cuda.CUDAGraph()\n        with torch.cuda.graph(g):\n            static_out.copy_(x @ w)\n\n        # Refill the static input in place, replay, and compare against eager\n        # execution to show that the replay used the new data.\n        x.copy_(torch.randn_like(x))\n        g.replay()\n        torch.cuda.synchronize()\n        max_err = (static_out - x @ w).abs().max().item()\n        print('Captured and replayed one matmul graph.')\n        print('max |graph - eager|:', max_err)\nexcept Exception as e:\n    # Deliberate best-effort: the demo is optional, so report and move on.\n    print('Graph demo skipped:', e)\n",
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "pygments_lexer": "ipython3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
