flowchart TD
    %% Path of a torch.matmul call from Python code down to the GPU.
    %% Each subgraph is one layer of the stack; edges show the hand-off between layers.

    %% Layer 1: the user-space Python process and the native bindings it calls into.
    subgraph Python_Process["🧠 Python Process (User Space)"]
        A["Python Code<br/>torch.matmul(A, B)"]
        B["PyTorch C++ Bindings<br/>torch._C / ATen"]
    end

    %% Layer 2: the CUDA runtime/driver that receives the kernel launch request.
    %% NOTE(review): device memory (VRAM) is physically part of the GPU rather than
    %% the driver layer; consider moving D into GPU_Hardware - confirm intended grouping.
    subgraph GPU_Driver["⚙️ CUDA Runtime / Driver"]
        C["CUDA API Call<br/>cudaLaunchKernel(...)"]
        D["GPU Device Memory<br/>Tensor data in VRAM"]
    end

    %% Layer 3: the hardware that actually executes the kernel.
    subgraph GPU_Hardware["🔩 GPU Hardware"]
        E["Streaming Multiprocessors<br/>execute threads/warps"]
    end

    %% Control flow: Python -> bindings -> CUDA API -> GPU execution units.
    A -->|"calls into"| B
    B -->|"enqueues kernel via CUDA API"| C
    C -->|"launches kernel on"| E

    %% Data flow: the running kernel accesses tensor data held in device memory.
    E -->|"reads/writes"| D