Custom kernels unused bits

Author

Marie-Hélène Burle

Option 2:

@cp.fuse()
def primeFactorizationSum(n):
    """Sum the prime factors of ``n``, counting repeated factors each time.

    For example, 12 = 2 * 2 * 3 gives 2 + 2 + 3 = 7.
    """
    remaining = n
    factor_sum = 0

    # Divide out every factor of 2 so that only odd divisors need testing.
    twos = 0
    while remaining % 2 == 0:
        remaining //= 2
        twos += 1
    factor_sum += twos * 2

    # Trial division by odd candidates while divisor^2 <= remaining.
    divisor = 3
    while divisor * divisor <= remaining:
        multiplicity = 0
        while remaining % divisor == 0:
            remaining //= divisor
            multiplicity += 1
        factor_sum += multiplicity * divisor
        divisor += 2

    # Whatever is left above 1 is itself a prime factor.
    if remaining > 1:
        factor_sum += remaining
    return factor_sum


# Upper bound of the integers to process; A gets one slot per integer in 2..n.
n = 1_000_000
A = cp.empty(n - 1, dtype=cp.int64)


# Time filling A element by element: A[i] holds the result for integer i + 2.
# timeit runs the statement `number` times and returns the total elapsed
# time, so exec_time covers all 20 passes, not a single one.
# The statement string looks up n, A and primeFactorizationSum via globals().
exec_time = timeit.timeit(
    "for i in range(0, n - 1): A[i] = primeFactorizationSum(i + 2)",
    number=20,
    globals=globals()
)

# NOTE(review): `benchmark` is presumably cupyx.profiler.benchmark — confirm
# against the file's imports. It is called on the single value n, 20 repeats.
print(benchmark(primeFactorizationSum, (n,), n_repeat=20))

print(f"Execution in {round(exec_time, 3)} s")
print(f"\nPrimes sum array: {A}")

General concepts

Input and output format: each parameter is declared as a type followed by a name.

Examples:

float32 a   # NumPy data types can be used
T x         # T = generic type

Elementwise kernels

<kernel name> = cp.ElementwiseKernel(
    '<list of inputs>',
    '<list of outputs>',
    '<operation to perform>',
    '<kernel name>'
)

Example:

# Elementwise kernel: for each pair (x, y), writes (x - y)^2 into z.
squared_diff = cp.ElementwiseKernel(
   'float32 x, float32 y',    # inputs
   'float32 z',               # output
   'z = (x - y) * (x - y)',   # operation applied to each element
   'squared_diff'             # kernel name
)

Reduction kernels

Let’s create a kernel that calculates the mean squared error (MSE).

This is how you would do this in NumPy:

import numpy as np

# Mean squared error between targets and predictions
def mse_fn(y_true, y_pred):
    """Return the mean of the squared elementwise differences."""
    residuals = y_true - y_pred
    squared_residuals = residuals ** 2
    return np.mean(squared_residuals)

# Dummy data: ground-truth targets and model predictions
y_true = np.array([1.0, 2.5, 3.5, 3.0], dtype=np.float32)
y_pred = np.array([1.5, 2.0, 3.5, 4.0], dtype=np.float32)

# Calculate MSE; arguments follow mse_fn's (y_true, y_pred) parameter order.
# The squared error is symmetric, so the printed value is the same either way.
mse = mse_fn(y_true, y_pred)

print(f"Predictions: {y_pred}")
print(f"Targets:     {y_true}")
print(f"MSE:         {mse}")
Predictions: [1.5 2.  3.5 4. ]
Targets:     [1.  2.5 3.5 3. ]
MSE:         0.375
# Reduction kernel computing the mean squared error of two arrays.
mse_kernel = cp.ReductionKernel(
    'T y_pred, T y_true',                     # inputs (generic type T)
    'T mse_out',                              # output
    '(y_pred - y_true) * (y_pred - y_true)',  # map: squared difference per element
    'a + b',                                  # reduce: sum the mapped values
    'mse_out = a / _in_ind.size()',           # post-reduction: divide the sum by _in_ind.size() (presumably the number of input elements — confirm in the CuPy docs)
    '0',                                      # identity value of the reduction
    'mse_kernel'                              # kernel name
)
# Dummy data (same values as the NumPy example, as CuPy arrays)
y_true = cp.array([1.0, 2.5, 3.5, 3.0], dtype=cp.float32)
y_pred = cp.array([1.5, 2.0, 3.5, 4.0], dtype=cp.float32)

# Calculate MSE; arguments are passed in the kernel's declared input order
mse = mse_kernel(y_pred, y_true)

print(f"Predictions: {y_pred}")
print(f"Targets:     {y_true}")
print(f"MSE:         {mse}")
prime_cpu.sh
#!/bin/bash
#SBATCH --time=5               # min
#SBATCH --mem=2048             # MB
#SBATCH --gpus=2g.10gb:1       # 1 MIG