#define WARP_SIZE 32
#define FULL_MASK 0xffffffff
#define OPTIMAL_THREADS 256

// Reduces index_vals over all blocks that share the same index in indices, writing the
// per-index maxima to max_vals and scattering each block's maximum back into max_vals_scatter.
__global__ void index_max_cuda_kernel(
    float *index_vals,
    int *indices,
    float *max_vals,
    float *max_vals_scatter,
    long batch_size,
    long A_num_block,
    long B_num_block,
    long num_block
);

// Computes dense_A @ dense_B, but materializes only the output blocks whose positions are
// listed in indices, storing them in the block-sparse tensor sparse_C.
__global__ void mm_to_sparse_cuda_kernel(
    float *dense_A,
    float *dense_B,
    int *indices,
    float *sparse_C,
    long batch_size,
    long A_num_block,
    long B_num_block,
    long dim,
    long num_block
);

// Multiplies the block-sparse tensor sparse_A (block positions given by indices) by dense_B,
// accumulating the result into the dense tensor dense_C.
__global__ void sparse_dense_mm_cuda_kernel(
    float *sparse_A,
    int *indices,
    float *dense_B,
    float *dense_C,
    long batch_size,
    long A_num_block,
    long B_num_block,
    long dim,
    long num_block
);

// Sums the blocks of sparse_A, grouped by their positions in indices, into the dense tensor dense_C.
__global__ void reduce_sum_cuda_kernel(
    float *sparse_A,
    int *indices,
    float *dense_C,
    long batch_size,
    long A_num_block,
    long B_num_block,
    long num_block
);

// Scatters values from dense_A into the block-sparse tensor sparse_C at the block positions
// given by indices.
__global__ void scatter_cuda_kernel(
    float *dense_A,
    int *indices,
    float *sparse_C,
    long batch_size,
    long A_num_block,
    long B_num_block,
    long num_block
);
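
// Example: one way these kernels might be launched from host code. This is an illustrative
// sketch only; the wrapper name and the grid/block configuration below are assumptions, not
// the extension's actual launch code.
//
//   void index_max_launch(float *index_vals, int *indices, float *max_vals,
//                         float *max_vals_scatter, long batch_size, long A_num_block,
//                         long B_num_block, long num_block) {
//     dim3 threads(OPTIMAL_THREADS);          // 256 threads = 8 warps of WARP_SIZE
//     dim3 blocks(batch_size * A_num_block);  // one thread block per (batch, A block) pair
//     index_max_cuda_kernel<<<blocks, threads>>>(
//         index_vals, indices, max_vals, max_vals_scatter,
//         batch_size, A_num_block, B_num_block, num_block);
//   }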