Custom kernels

Author

Marie-Hélène Burle

There are situations in which CuPy preset functions and routines aren’t sufficient to port your code to the GPU. You will then have to write your own “GPU functions” called kernels.

Example code

Alex Razoumov teaches Numba and Numba-CUDA

On CPU with NumPy:

import numpy as np
import timeit

def primeFactorizationSum(n):
    num = n
    total = 0
    if num % 2 == 0:
        count = 0
        while num % 2 == 0:
            num //= 2
            count += 1
        total += count * 2
    i = 3
    while i * i <= num:
        if num % i == 0:
            count = 0
            while num % i == 0:
                num //= i
                count += 1
            total += count * i
        i += 2
    if num > 1:
        total += num
    return total

n = 1_000_000
A = np.empty(n - 1, dtype=np.int64)

exec_time = timeit.timeit(
    "for i in range(0, n - 1): A[i] = primeFactorizationSum(i + 2)",
    number=20,
    globals=globals()
)

print(f"Execution in {round(exec_time, 3)} s")
print(f"\nPrimes sum array: {A}")
Execution in 108.222 s

Primes sum array: [  2   3   4 ... 287  77  42]

import cupy as cp
import math
from cupyx.profiler import benchmark

def get_small_primes(limit):
    """Simple CPU sieve to get primes up to sqrt(N)"""
    is_prime = [True] * (limit + 1)
    primes = []
    for p in range(2, limit + 1):
        if is_prime[p]:
            primes.append(p)
            for i in range(p * p, limit + 1, p):
                is_prime[i] = False
    return primes

n = 1_000_000

# 1. Get primes up to sqrt(1,000,000) which is 1000.
# There are only 168 primes up to 1000.
small_primes = get_small_primes(int(math.sqrt(n)))

# 2. Initialize the main arrays on the GPU
# nums represents the numbers [2, 3, 4 ... 1,000,000]
nums = cp.arange(2, n + 1, dtype=cp.int64)
totals = cp.zeros(n - 1, dtype=cp.int64)

# 3. Apply broadcasting to divide out the small primes
for p in small_primes:
    # Calculate how many times 'p' could possibly divide into 'n'
    power = p
    max_iter = 0
    while power <= n:
        max_iter += 1
        power *= p

    # Repeatedly divide out the prime and add to total where applicable
    for _ in range(max_iter):
        mask = (nums % p == 0)      # Find all elements divisible by p
        totals[mask] += p           # Add p to their totals
        nums[mask] //= p            # Divide them by p

# 4. Any numbers left in 'nums' greater than 1 are primes themselves
# (e.g. for number 14, after dividing by 2, 'nums' holds 7)
mask = (nums > 1)
totals[mask] += nums[mask]

# Wait for GPU to finish
cp.cuda.Stream.null.synchronize()

# print(benchmark(mse_kernel, (y_cp_true, y_cp_pred), n_repeat=20))

print(f"\nPrimes sum array: {A_cpu}")

General concepts

Inputs and output format: type + name.

Examples:

float32 a   # NumPy data types can be used
T x         # T = generic type

Elementwise kernels

<kernel name> = cp.ElementwiseKernel(
    '<list of inputs>',
    '<list of outputs>',
    '<operation to perform>',
    '<kernel name>'
)

Example:

squared_diff = cp.ElementwiseKernel(
   'float32 x, float32 y',
   'float32 z',
   'z = (x - y) * (x - y)',
   'squared_diff'
)

Reduction kernels

Let’s create a kernel that calculates the mean square error.

This is how you would do this in NumPy:

import numpy as np

# Function to calculate MSE
def mse_fn(y_true, y_pred):
    mse_out = np.mean((y_true - y_pred)**2)
    return mse_out

# Dummy data
y_true = np.array([1.0, 2.5, 3.5, 3.0], dtype=np.float32)
y_pred = np.array([1.5, 2.0, 3.5, 4.0], dtype=np.float32)

# Calculate MSE
mse = mse_fn(y_pred, y_true)

print(f"Predictions: {y_pred}")
print(f"Targets:     {y_true}")
print(f"MSE:         {mse}")
Predictions: [1.5 2.  3.5 4. ]
Targets:     [1.  2.5 3.5 3. ]
MSE:         0.375
mse_kernel = cp.ReductionKernel(
    'T y_pred, T y_true',
    'T mse_out',
    '(y_pred - y_true) * (y_pred - y_true)',
    'a + b',
    'mse_out = a / _in_ind.size()',
    '0',
    'mse_kernel'
)
# Dummy data
y_true = cp.array([1.0, 2.5, 3.5, 3.0], dtype=cp.float32)
y_pred = cp.array([1.5, 2.0, 3.5, 4.0], dtype=cp.float32)

# Calculate MSE
mse = mse_kernel(y_pred, y_true)

print(f"Predictions: {y_pred}")
print(f"Targets:     {y_true}")
print(f"MSE:         {mse}")

Kernel fusion