There are situations in which CuPy preset functions and routines aren’t sufficient to port your code to the GPU. You will then have to write your own “GPU functions” called kernels.
import numpy as np
import timeit


def primeFactorizationSum(n):
    """Return the sum of the prime factors of *n*, counted with multiplicity.

    E.g. 12 = 2 * 2 * 3  ->  2 + 2 + 3 = 7.  Returns 0 for n < 2
    (no prime factors).
    """
    num = n
    total = 0
    # Divide out all factors of 2 first so the trial-division loop
    # below only has to test odd candidates.
    if num % 2 == 0:
        count = 0
        while num % 2 == 0:
            num //= 2
            count += 1
        total += count * 2
    # Trial division by odd candidates up to sqrt of the remaining value.
    i = 3
    while i * i <= num:
        if num % i == 0:
            count = 0
            while num % i == 0:
                num //= i
                count += 1
            total += count * i
        i += 2
    # Whatever remains (> 1) is itself a prime factor.
    if num > 1:
        total += num
    return total


def main():
    """Benchmark primeFactorizationSum over the numbers 2..1_000_000 on the CPU."""
    n = 1_000_000
    A = np.empty(n - 1, dtype=np.int64)
    # Pass an explicit globals dict instead of globals() so the timed
    # statement does not depend on module-level state.
    exec_time = timeit.timeit(
        "for i in range(0, n - 1): A[i] = primeFactorizationSum(i + 2)",
        number=20,
        globals={"n": n, "A": A, "primeFactorizationSum": primeFactorizationSum},
    )
    print(f"Execution in {round(exec_time, 3)} s")
    print(f"\nPrimes sum array: {A}")


# Guard the ~100 s benchmark so importing this module is cheap.
if __name__ == "__main__":
    main()
Execution in 108.222 s
Primes sum array: [ 2 3 4 ... 287 77 42]
import math

import cupy as cp
from cupyx.profiler import benchmark  # e.g. print(benchmark(main, (), n_repeat=20))


def get_small_primes(limit):
    """Simple CPU sieve of Eratosthenes: all primes up to *limit* (inclusive)."""
    is_prime = [True] * (limit + 1)
    primes = []
    for p in range(2, limit + 1):
        if is_prime[p]:
            primes.append(p)
            for i in range(p * p, limit + 1, p):
                is_prime[i] = False
    return primes


def main():
    """Compute the prime-factor sums of 2..1_000_000 on the GPU with CuPy."""
    n = 1_000_000

    # 1. Get primes up to sqrt(1,000,000) = 1000.
    #    There are only 168 primes up to 1000.
    small_primes = get_small_primes(int(math.sqrt(n)))

    # 2. Initialize the main arrays on the GPU.
    #    nums represents the numbers [2, 3, 4, ..., 1,000,000].
    nums = cp.arange(2, n + 1, dtype=cp.int64)
    totals = cp.zeros(n - 1, dtype=cp.int64)

    # 3. Apply broadcasting to divide out the small primes.
    for p in small_primes:
        # Largest k such that p**k could still divide a number <= n.
        power = p
        max_iter = 0
        while power <= n:
            max_iter += 1
            power *= p
        # Repeatedly divide out the prime and add to the total where applicable.
        for _ in range(max_iter):
            mask = (nums % p == 0)  # elements still divisible by p
            totals[mask] += p       # add p to their totals
            nums[mask] //= p        # divide them by p

    # 4. Any numbers left in 'nums' greater than 1 are primes themselves
    #    (e.g. for 14, after dividing by 2, 'nums' holds 7).
    mask = (nums > 1)
    totals[mask] += nums[mask]

    # Wait for the GPU to finish before reading results back.
    cp.cuda.Stream.null.synchronize()

    # Copy the result to host memory before printing (the original
    # referenced an undefined 'A_cpu' here).
    A_cpu = cp.asnumpy(totals)
    print(f"\nPrimes sum array: {A_cpu}")


# Guard the GPU job so importing this module does not launch it.
if __name__ == "__main__":
    main()
General concepts
Inputs and outputs are each declared as a type followed by a name.
Examples:
float32 a  # NumPy data types can be used
T x        # T = generic type
Elementwise kernels
<kernel name> = cp.ElementwiseKernel(
'<list of inputs>',
'<list of outputs>',
'<operation to perform>',
'<kernel name>'
)