{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "# Day 17 - CUDA Vector Add and Matmul\n\nThis notebook is dependency-aware. It runs CUDA source-generation and indexing checks anywhere, and only compiles kernels when `nvcc` is present.\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": "import shutil, subprocess, tempfile, textwrap, os\nimport numpy as np\nprint('numpy', np.__version__)\nprint('nvcc:', shutil.which('nvcc'))\n",
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 1. Thread Indexing by Hand\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "def global_idx(block_idx, block_dim, thread_idx):\n",
        "    \"\"\"Flatten a 1-D (block, thread) pair into a single global index.\n",
        "\n",
        "    Mirrors the CUDA expression ``blockIdx.x * blockDim.x + threadIdx.x``:\n",
        "    the block contributes ``block_idx * block_dim`` and the thread's position\n",
        "    inside that block is added on top.\n",
        "    \"\"\"\n",
        "    block_offset = block_idx * block_dim\n",
        "    return block_offset + thread_idx\n",
        "\n",
        "# Each tuple is (block index, block size, thread index within the block).\n",
        "cases = [\n",
        "    (0, 256, 0),\n",
        "    (0, 256, 255),\n",
        "    (1, 256, 0),\n",
        "    (3, 256, 10),\n",
        "]\n",
        "for b, bd, t in cases:\n",
        "    print(f'block={b}, blockDim={bd}, thread={t} -> idx={global_idx(b, bd, t)}')\n",
        "# Hand-computed check: 3 * 256 + 10 == 778.\n",
        "assert global_idx(3, 256, 10) == 778\n"
      ],
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 2. Vector Add Kernel Source\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# CUDA C source kept as a raw string so it can be printed here and later\n",
        "# written to a .cu file without Python interpreting any backslashes.\n",
        "vector_add_cu = r\"\"\"\n",
        "#include <stdio.h>\n",
        "\n",
        "__global__ void vec_add(const float* a, const float* b, float* c, int n) {\n",
        "    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n",
        "    if (idx < n) c[idx] = a[idx] + b[idx];\n",
        "}\n",
        "\"\"\"\n",
        "print(vector_add_cu)\n"
      ],
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 3. Coalescing Simulation\n\nA warp reading consecutive addresses touches fewer memory segments than a warp reading with a stride.\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "def memory_segments(addresses, segment_size=32):\n",
        "    \"\"\"Return the sorted, de-duplicated segment ids touched by ``addresses``.\n",
        "\n",
        "    Each address is truncated with ``int()`` and floor-divided by\n",
        "    ``segment_size``; addresses that land in the same segment count once.\n",
        "    \"\"\"\n",
        "    touched = {int(addr) // segment_size for addr in addresses}\n",
        "    segment_ids = list(touched)\n",
        "    segment_ids.sort()\n",
        "    return segment_ids\n",
        "\n",
        "# 32 addresses per pattern (standing in for one warp): unit stride vs. stride 2.\n",
        "coalesced = np.arange(32)\n",
        "strided = 2 * np.arange(32)\n",
        "print('coalesced segments:', memory_segments(coalesced))\n",
        "print('strided segments:  ', memory_segments(strided))\n",
        "# The strided pattern must never touch fewer segments than the contiguous one.\n",
        "assert len(memory_segments(coalesced)) <= len(memory_segments(strided))\n"
      ],
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 4. Matmul Traffic Estimate\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "def naive_matmul_loads(M, N, K):\n",
        "    \"\"\"Scalar loads for an untiled (M x K) @ (K x N) matmul.\n",
        "\n",
        "    Each of the M * N output elements reads K values from A and K from B.\n",
        "    \"\"\"\n",
        "    loads_per_output = 2 * K\n",
        "    return M * N * loads_per_output\n",
        "\n",
        "def tiled_matmul_loads(M, N, K, tile=32):\n",
        "    \"\"\"Rough load estimate when every loaded value is reused by ``tile`` outputs.\n",
        "\n",
        "    Uses true division, so the result is a float.\n",
        "    \"\"\"\n",
        "    untiled = naive_matmul_loads(M, N, K)\n",
        "    return untiled / tile\n",
        "\n",
        "# Square 1024 x 1024 x 1024 problem with 32-wide tiles.\n",
        "M = N = K = 1024\n",
        "naive = naive_matmul_loads(M, N, K)\n",
        "tiled = tiled_matmul_loads(M, N, K, 32)\n",
        "print(f'naive scalar loads ~= {naive/1e9:.2f}B')\n",
        "print(f'tiled scalar loads ~= {tiled/1e9:.2f}B')\n",
        "print(f'load reduction ~= {naive/tiled:.0f}x')\n"
      ],
      "outputs": [],
      "execution_count": null
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": "## 5. Optional nvcc Compile Skeleton\n\nThe compile path is intentionally skipped when CUDA is unavailable.\n"
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Resolve nvcc once so the availability check and the compile step use the\n",
        "# same executable.\n",
        "nvcc_path = shutil.which('nvcc')\n",
        "if nvcc_path is None:\n",
        "    print('No nvcc found; skipping compile. Run this cell on an NVIDIA CUDA machine.')\n",
        "else:\n",
        "    # Build in a throwaway directory that is removed when the block exits.\n",
        "    with tempfile.TemporaryDirectory() as td:\n",
        "        src = os.path.join(td, 'vec_add.cu')\n",
        "        exe = os.path.join(td, 'vec_add')\n",
        "        # Append a trivial host main() so the kernel source links into a program.\n",
        "        full = vector_add_cu + r'''\n",
        "int main() { printf(\"compile smoke test ok\\n\"); return 0; }\n",
        "'''\n",
        "        # Close the file explicitly so the source is fully flushed to disk\n",
        "        # before nvcc reads it, instead of relying on garbage collection.\n",
        "        with open(src, 'w') as src_file:\n",
        "            src_file.write(full)\n",
        "        # check=True raises CalledProcessError if compiling or running fails.\n",
        "        subprocess.run([nvcc_path, src, '-o', exe], check=True)\n",
        "        subprocess.run([exe], check=True)\n"
      ],
      "outputs": [],
      "execution_count": null
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "pygments_lexer": "ipython3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
