# Measuring performance
Run under `nsight-cu`:

```
nv-nsight-cu-cli --nvtx --profile-from-start=off --section=SpeedOfLight julia --project=examples examples/performance.jl
```
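
On newer CUDA toolkits the same profiler ships as `ncu` instead of `nv-nsight-cu-cli`; a roughly equivalent invocation (not verified here) would be:

```
ncu --nvtx --profile-from-start off --section SpeedOfLight julia --project=examples examples/performance.jl
```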
Results:

Collated results on a V100:

| Kernel          | Time   | Speed of Light Mem % |
|-----------------|--------|----------------------|
| naive (32, 32)  | 1.19ms | 65.06%               |
| naive (1024, 1) | 1.79ms | 56.13%               |
| naive (1, 1024) | 3.03ms | 60.02%               |

Full output:

```
==PROF== 0: Naive transpose (32, 32)
  Section: GPU Speed Of Light
    ---------------------- -------------- --------------
    Memory Frequency       cycle/usecond  878.88
    SOL FB                 %              38.16
    Elapsed Cycles         cycle          1,447,874
    SM Frequency           cycle/nsecond  1.23
    Memory [%]             %              65.93
    Duration               msecond        1.17
    SOL L2                 %              19.08
    SOL TEX                %              66.19
    SM Active Cycles       cycle          1,440,706.40
    SM [%]                 %              23.56
    ---------------------- -------------- --------------
  ptxcall___gpu_transpose_kernel_naive__430_2, 2020-Feb-20 22:42:24, Context 1, Stream 23

==PROF== 0: Naive transpose (1024, 1)
  Section: GPU Speed Of Light
    ---------------------- -------------- --------------
    Memory Frequency       cycle/usecond  877.69
    SOL FB                 %              22.40
    Elapsed Cycles         cycle          2,473,141
    SM Frequency           cycle/nsecond  1.23
    Memory [%]             %              51.17
    Duration               msecond        2.00
    SOL L2                 %              50.17
    SOL TEX                %              51.27
    SM Active Cycles       cycle          2,465,610.06
    SM [%]                 %              11.68
    ---------------------- -------------- --------------
  ptxcall___gpu_transpose_kernel_naive__430_3, 2020-Feb-20 22:42:28, Context 1, Stream 25

==PROF== 0: Naive transpose (1, 1024)
  Section: GPU Speed Of Light
    ---------------------- -------------- --------------
    Memory Frequency       cycle/usecond  876.69
    SOL FB                 %              17.88
    Elapsed Cycles         cycle          3,737,127
    SM Frequency           cycle/nsecond  1.24
    Memory [%]             %              60.02
    Duration               msecond        3.02
    SOL L2                 %              60.02
    SOL TEX                %              45.65
    SM Active Cycles       cycle          3,732,591.59
    SM [%]                 %              12.56
    ---------------------- -------------- --------------
```
## Code

```julia
# EXCLUDE FROM TESTING
using KernelAbstractions, Test, Random
include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
using KernelAbstractions.Extras: @unroll
using NVTX # TODO: Common front-end

const nreps = 3
const N = 2048
const T = Float32

const TILE_DIM = 32
const BLOCK_ROWS = 8

# Simple variants

@kernel function simple_copy_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[I, J] = input[I, J]
end

@kernel function simple_transpose_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[J, I] = input[I, J]
end
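
# Note: in `simple_transpose_kernel!` the reads from `input[I, J]` are contiguous
# in memory along `I` (Julia arrays are column-major), but each step in `I` moves
# the write to `output[J, I]` by a whole column, so the writes are not coalesced.
# The local-memory variants below stage a tile in shared memory so that both the
# global reads and the global writes stay contiguous.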

# Local memory variants

@kernel function lmem_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    I, J = @index(Global, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    @inbounds tile[i, j] = input[I, J]
    @synchronize
    @inbounds output[I, J] = tile[i, j]
end

@kernel function lmem_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    # Manually calculate global indexes
    # Later on we need to pivot the group index
    I = (gi - 1) * N + i
    J = (gj - 1) * M + j

    @inbounds tile[i, j] = input[I, J]

    @synchronize

    # Pivot the group index
    I = (gj - 1) * M + i
    J = (gi - 1) * N + j

    @inbounds output[I, J] = tile[j, i]
end

# Local Memory + process multiple elements per lane

@kernel function coalesced_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[i, j + k]
    end
end

@kernel function coalesced_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    # Transpose block offsets
    I = (gj - 1) * TILE_DIM + i
    J = (gi - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[j + k, i]
    end
end
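
# Each benchmark below launches the kernel once to force compilation, then `nreps`
# more times inside an NVTX range so the launches are easy to locate in the
# profiler output.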

# Benchmark simple

for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM * TILE_DIM))
    for (name, kernel) in (
            ("copy", simple_copy_kernel!(backend, block_dims)),
            ("transpose", simple_transpose_kernel!(backend, block_dims)),
        )
        NVTX.@range "Simple $name $block_dims" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)

            # compile kernel
            kernel(output, input, ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem

for (name, kernel) in (
        ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
        ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)

            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem + multiple elements per lane

for (name, kernel) in (
        ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
        ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)

            # We want a number of blocks equivalent to (TILE_DIM, TILE_DIM),
            # but our blocks are (TILE_DIM, BLOCK_ROWS), so we need to remove
            # a factor from the size of the array, otherwise we get too many blocks
            block_factor = div(TILE_DIM, BLOCK_ROWS)
            ndrange = (N, div(N, block_factor))

            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end
```
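
The script imports `Test` but only benchmarks. A minimal correctness check, assuming the definitions above (`backend`, `T`, `N`, `TILE_DIM`, and the kernels) are already loaded, could look like this sketch:

```julia
# Hypothetical sanity check (not part of the original example): compare the
# device transpose against a host-side permutedims.
let
    input = rand!(allocate(backend, T, N, N))
    output = similar(input)

    kernel = simple_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))
    kernel(output, input, ndrange = size(output))
    KernelAbstractions.synchronize(backend)

    @test Array(output) == permutedims(Array(input))
end
```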