# Measuring performance
Run under `nsight-cu`:

```
nv-nsight-cu-cli --nvtx --profile-from-start=off --section=SpeedOfLight julia --project=examples examples/performance.jl
```
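
On newer CUDA toolkits the same profiler ships as `ncu` instead of `nv-nsight-cu-cli`; a roughly equivalent invocation (not verified here) would be:

```
ncu --nvtx --profile-from-start off --section SpeedOfLight julia --project=examples examples/performance.jl
```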
Results:

Collated results on a V100:

| Kernel          | Time   | Speed of Light Mem % |
|-----------------|--------|----------------------|
| naive (32, 32)  | 1.19ms | 65.06%               |
| naive (1024, 1) | 1.79ms | 56.13%               |
| naive (1, 1024) | 3.03ms | 60.02%               |

Full output:

```
==PROF== 0: Naive transpose (32, 32)
  Section: GPU Speed Of Light
    ---------------------- -------------- --------------
    Memory Frequency       cycle/usecond  878.88
    SOL FB                 %              38.16
    Elapsed Cycles         cycle          1,447,874
    SM Frequency           cycle/nsecond  1.23
    Memory [%]             %              65.93
    Duration               msecond        1.17
    SOL L2                 %              19.08
    SOL TEX                %              66.19
    SM Active Cycles       cycle          1,440,706.40
    SM [%]                 %              23.56
    ---------------------- -------------- --------------
  ptxcall___gpu_transpose_kernel_naive__430_2, 2020-Feb-20 22:42:24, Context 1, Stream 23

==PROF== 0: Naive transpose (1024, 1)
  Section: GPU Speed Of Light
    ---------------------- -------------- --------------
    Memory Frequency       cycle/usecond  877.69
    SOL FB                 %              22.40
    Elapsed Cycles         cycle          2,473,141
    SM Frequency           cycle/nsecond  1.23
    Memory [%]             %              51.17
    Duration               msecond        2.00
    SOL L2                 %              50.17
    SOL TEX                %              51.27
    SM Active Cycles       cycle          2,465,610.06
    SM [%]                 %              11.68
    ---------------------- -------------- --------------
  ptxcall___gpu_transpose_kernel_naive__430_3, 2020-Feb-20 22:42:28, Context 1, Stream 25

==PROF== 0: Naive transpose (1, 1024)
  Section: GPU Speed Of Light
    ---------------------- -------------- --------------
    Memory Frequency       cycle/usecond  876.69
    SOL FB                 %              17.88
    Elapsed Cycles         cycle          3,737,127
    SM Frequency           cycle/nsecond  1.24
    Memory [%]             %              60.02
    Duration               msecond        3.02
    SOL L2                 %              60.02
    SOL TEX                %              45.65
    SM Active Cycles       cycle          3,732,591.59
    SM [%]                 %              12.56
    ---------------------- -------------- --------------
```
## Code

```julia
# EXCLUDE FROM TESTING
using KernelAbstractions, Test, Random
include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
using KernelAbstractions.Extras: @unroll
using NVTX # TODO: Common front-end

const nreps = 3
const N = 2048
const T = Float32

const TILE_DIM = 32
const BLOCK_ROWS = 8

# Simple variants

@kernel function simple_copy_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[I, J] = input[I, J]
end

@kernel function simple_transpose_kernel!(output, @Const(input))
    I, J = @index(Global, NTuple)
    @inbounds output[J, I] = input[I, J]
end
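
# Note: in `simple_transpose_kernel!` the reads from `input[I, J]` are contiguous
# in memory along `I` (Julia arrays are column-major), but each step in `I` moves
# the write to `output[J, I]` by a whole column, so the writes are not coalesced.
# The local-memory variants below stage a tile in shared memory so that both the
# global reads and the global writes stay contiguous.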

# Local memory variants

@kernel function lmem_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    I, J = @index(Global, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    @inbounds tile[i, j] = input[I, J]
    @synchronize
    @inbounds output[I, J] = tile[i, j]
end

@kernel function lmem_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    N = @uniform @groupsize()[1]
    M = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (N + BANK, M)

    # Manually calculate global indexes
    # Later on we need to pivot the group index
    I = (gi - 1) * N + i
    J = (gj - 1) * M + j

    @inbounds tile[i, j] = input[I, J]

    @synchronize

    # Pivot the group index
    I = (gj - 1) * M + i
    J = (gi - 1) * N + j

    @inbounds output[I, J] = tile[j, i]
end

# Local Memory + process multiple elements per lane

@kernel function coalesced_copy_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[i, j + k]
    end
end

@kernel function coalesced_transpose_kernel!(
        output, @Const(input),
        ::Val{BANK} = Val(1),
    ) where {BANK}
    gi, gj = @index(Group, NTuple)
    i, j = @index(Local, NTuple)

    TILE_DIM = @uniform @groupsize()[1]
    BLOCK_ROWS = @uniform @groupsize()[2]

    # +1 to avoid bank conflicts on shared memory
    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)

    # Can't use @index(Global), because we use a smaller ndrange
    I = (gi - 1) * TILE_DIM + i
    J = (gj - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds tile[i, j + k] = input[I, J + k]
    end

    @synchronize

    # Transpose block offsets
    I = (gj - 1) * TILE_DIM + i
    J = (gi - 1) * TILE_DIM + j

    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
        @inbounds output[I, J + k] = tile[j + k, i]
    end
end
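
# Each benchmark below launches the kernel once to force compilation, then `nreps`
# more times inside an NVTX range so the launches are easy to locate in the
# profiler output.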

# Benchmark simple

for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM * TILE_DIM))
    for (name, kernel) in (
            ("copy", simple_copy_kernel!(backend, block_dims)),
            ("transpose", simple_transpose_kernel!(backend, block_dims)),
        )
        NVTX.@range "Simple $name $block_dims" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)

            # compile kernel
            kernel(output, input, ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem

for (name, kernel) in (
        ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
        ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)

            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = size(output))
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = size(output))
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end

# Benchmark localmem + multiple elements per lane

for (name, kernel) in (
        ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
        ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
    )
    for bank in (true, false)
        NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
            input = rand!(allocate(backend, T, N, N))
            output = similar(input)

            # We want a number of blocks equivalent to (TILE_DIM, TILE_DIM),
            # but our blocks are (TILE_DIM, BLOCK_ROWS), so we need to remove
            # a factor from the size of the array, otherwise we get too many blocks
            block_factor = div(TILE_DIM, BLOCK_ROWS)
            ndrange = (N, div(N, block_factor))

            # compile kernel
            kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            for rep in 1:nreps
                kernel(output, input, Val(Int(bank)), ndrange = ndrange)
            end
            KernelAbstractions.synchronize(backend)
        end
    end
end
```
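
The script imports `Test` but only benchmarks. A minimal correctness check, assuming the definitions above (`backend`, `T`, `N`, `TILE_DIM`, and the kernels) are already loaded, could look like this sketch:

```julia
# Hypothetical sanity check (not part of the original example): compare the
# device transpose against a host-side permutedims.
let
    input = rand!(allocate(backend, T, N, N))
    output = similar(input)

    kernel = simple_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))
    kernel(output, input, ndrange = size(output))
    KernelAbstractions.synchronize(backend)

    @test Array(output) == permutedims(Array(input))
end
```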