Macros | |
#define | CUDA_KERNEL_LOOP(i, n) |
#define | swap(a0, a1, j, m) t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j); |
Functions | |
template<typename T > | |
__device__ uint32_t | __ballot_custom (T val) |
template<typename T1 , typename T2 > | |
__device__ T1 | __shfl_custom (T1 val, T2 lane) |
int | CAFFE_GET_BLOCKS (const int N) |
void | convolve_bin_gpu (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, int new_lda, float *mean_arr_gpu) |
__global__ void | convolve_bin_gpu_kernel (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, int new_lda, float *mean_arr_gpu) |
void | convolve_gpu (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) |
__global__ void | convolve_gpu_kernel (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) |
void | fill_int8_gpu (unsigned char *src, unsigned char val, size_t size) |
__global__ void | fill_int8_gpu_kernel (unsigned char *src, unsigned char val, size_t size) |
void | float_to_bit_gpu (float *src, unsigned char *dst, size_t size) |
__global__ void | float_to_bit_gpu_kernel (float *src, unsigned char *dst, size_t size) |
void | gemm_nn_custom_bin_mean_transposed_gpu (int M, int N, int K, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr, float *bias, int leaky_activation, float *shortcut_in_gpu, float *shortcut_out_gpu) |
__global__ void | gemm_nn_custom_bin_mean_transposed_gpu_kernel (int M, int N, int K, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation, float *shortcut_in_gpu, float *shortcut_out_gpu) |
__device__ static __host__ unsigned char | get_bit (unsigned char const *const src, size_t index) |
__global__ void | im2col_align_bin_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int channels, const int pad, const int stride, const int height_col, const int width_col, float *data_col, const int bit_align) |
void | im2col_align_bin_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align) |
__global__ void | im2col_align_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, float *data_col, const int bit_align) |
void | im2col_align_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align) |
void | im2col_gpu_ext (const float *data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, float *data_col) |
__global__ void | im2col_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, float *data_col) |
__global__ void | im2col_gpu_kernel_ext (const int n, const float *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int height_col, const int width_col, float *data_col) |
void | im2col_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col) |
void | repack_input_gpu (float *input, float *re_packed_input, int w, int h, int c) |
void | repack_input_gpu_bin (float *input, uint32_t *re_packed_input_bin, int w, int h, int c) |
__global__ void | repack_input_kernel (float *input, float *re_packed_input, int w, int h, int c) |
__global__ void | repack_input_kernel_2 (float *input, float *re_packed_input, int w, int h, int c) |
__global__ void | repack_input_kernel_bin (float *input, uint32_t *re_packed_input_bin, int w, int h, int c) |
__device__ uint32_t | reverse_32_bit (uint32_t a) |
__device__ __host__ uint8_t | reverse_8_bit (uint8_t a) |
__device__ __host__ unsigned char | reverse_byte (unsigned char a) |
__device__ __host__ unsigned char | reverse_byte_2 (unsigned char a) |
__device__ unsigned char | reverse_byte_CUDA (unsigned char a) |
__device__ void | transpose32_optimized (uint32_t A[32]) |
__device__ void | transpose8rS32_reversed_diagonale (unsigned char *A, unsigned char *B, int m, int n) |
__device__ void | transpose_32x32_bits_reversed_diagonale (uint32_t *A, uint32_t *B, int m, int n) |
void | transpose_bin_gpu (unsigned char *A, unsigned char *B, const int n, const int m, const int lda, const int ldb, const int block_size) |
__global__ void | transpose_bin_gpu_kernel (unsigned char *A, unsigned char *B, const int n, const int m, const int lda, const int ldb, const int block_size) |
__global__ void | transpose_bin_gpu_kernel_32 (uint32_t *A, uint32_t *B, const int n, const int m, const int lda, const int ldb, const int block_size) |
void | transpose_uint32_gpu (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align) |
__global__ void | transpose_uint32_kernel (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align) |
__global__ void | transpose_uint32_kernel_2 (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align) |
__inline__ __device__ int | warpAllReduceSum (int val) |
__device__ static __host__ uint8_t | xnor_bit1 (uint8_t a, uint8_t b) |
__device__ static __host__ ulonglong4 | xor_int256 (ulonglong4 a, ulonglong4 b) |
__device__ static __host__ uint32_t | xor_int32 (uint32_t a, uint32_t b) |
__device__ static __host__ uint64_t | xor_int64 (uint64_t a, uint64_t b) |
Variables | |
const int | CAFFE_CUDA_NUM_THREADS = 512 |
#define CUDA_KERNEL_LOOP | ( | i, | |
n | |||
) |
#define swap | ( | a0, | |
a1, | |||
j, | |||
m | |||
) | t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j); |
|
inline |
|
inline |
|
inline |
void convolve_bin_gpu | ( | float * | input, |
float * | weights, | ||
float * | output, | ||
int | in_w, | ||
int | in_h, | ||
int | in_c, | ||
int | n, | ||
int | size, | ||
int | pad, | ||
int | new_lda, | ||
float * | mean_arr_gpu | ||
) |
__global__ void convolve_bin_gpu_kernel | ( | float * | input, |
float * | weights, | ||
float * | output, | ||
int | in_w, | ||
int | in_h, | ||
int | in_c, | ||
int | n, | ||
int | size, | ||
int | pad, | ||
int | new_lda, | ||
float * | mean_arr_gpu | ||
) |
void convolve_gpu | ( | float * | input, |
float * | weights, | ||
float * | output, | ||
int | in_w, | ||
int | in_h, | ||
int | in_c, | ||
int | n, | ||
int | size, | ||
int | pad | ||
) |
__global__ void convolve_gpu_kernel | ( | float * | input, |
float * | weights, | ||
float * | output, | ||
int | in_w, | ||
int | in_h, | ||
int | in_c, | ||
int | n, | ||
int | size, | ||
int | pad | ||
) |
void fill_int8_gpu | ( | unsigned char * | src, |
unsigned char | val, | ||
size_t | size | ||
) |
__global__ void fill_int8_gpu_kernel | ( | unsigned char * | src, |
unsigned char | val, | ||
size_t | size | ||
) |
void float_to_bit_gpu | ( | float * | src, |
unsigned char * | dst, | ||
size_t | size | ||
) |
__global__ void float_to_bit_gpu_kernel | ( | float * | src, |
unsigned char * | dst, | ||
size_t | size | ||
) |
void gemm_nn_custom_bin_mean_transposed_gpu | ( | int | M, |
int | N, | ||
int | K, | ||
unsigned char * | A, | ||
int | lda, | ||
unsigned char * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc, | ||
float * | mean_arr, | ||
float * | bias, | ||
int | leaky_activation, | ||
float * | shortcut_in_gpu, | ||
float * | shortcut_out_gpu | ||
) |
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel | ( | int | M, |
int | N, | ||
int | K, | ||
unsigned char * | A, | ||
int | lda, | ||
unsigned char * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc, | ||
float * | mean_arr, | ||
float * | bias_arr, | ||
int | leaky_activation, | ||
float * | shortcut_in_gpu, | ||
float * | shortcut_out_gpu | ||
) |
|
inlinestatic |
__global__ void im2col_align_bin_gpu_kernel | ( | const int | n, |
const float * | data_im, | ||
const int | height, | ||
const int | width, | ||
const int | ksize, | ||
const int | channels, | ||
const int | pad, | ||
const int | stride, | ||
const int | height_col, | ||
const int | width_col, | ||
float * | data_col, | ||
const int | bit_align | ||
) |
void im2col_align_bin_ongpu | ( | float * | im, |
int | channels, | ||
int | height, | ||
int | width, | ||
int | ksize, | ||
int | stride, | ||
int | pad, | ||
float * | data_col, | ||
int | bit_align | ||
) |
__global__ void im2col_align_gpu_kernel | ( | const int | n, |
const float * | data_im, | ||
const int | height, | ||
const int | width, | ||
const int | ksize, | ||
const int | pad, | ||
const int | stride, | ||
const int | height_col, | ||
const int | width_col, | ||
float * | data_col, | ||
const int | bit_align | ||
) |
void im2col_align_ongpu | ( | float * | im, |
int | channels, | ||
int | height, | ||
int | width, | ||
int | ksize, | ||
int | stride, | ||
int | pad, | ||
float * | data_col, | ||
int | bit_align | ||
) |
void im2col_gpu_ext | ( | const float * | data_im, |
const int | channels, | ||
const int | height, | ||
const int | width, | ||
const int | kernel_h, | ||
const int | kernel_w, | ||
const int | pad_h, | ||
const int | pad_w, | ||
const int | stride_h, | ||
const int | stride_w, | ||
const int | dilation_h, | ||
const int | dilation_w, | ||
float * | data_col | ||
) |
__global__ void im2col_gpu_kernel | ( | const int | n, |
const float * | data_im, | ||
const int | height, | ||
const int | width, | ||
const int | ksize, | ||
const int | pad, | ||
const int | stride, | ||
const int | height_col, | ||
const int | width_col, | ||
float * | data_col | ||
) |
__global__ void im2col_gpu_kernel_ext | ( | const int | n, |
const float * | data_im, | ||
const int | height, | ||
const int | width, | ||
const int | kernel_h, | ||
const int | kernel_w, | ||
const int | pad_h, | ||
const int | pad_w, | ||
const int | stride_h, | ||
const int | stride_w, | ||
const int | dilation_h, | ||
const int | dilation_w, | ||
const int | height_col, | ||
const int | width_col, | ||
float * | data_col | ||
) |
void im2col_ongpu | ( | float * | im, |
int | channels, | ||
int | height, | ||
int | width, | ||
int | ksize, | ||
int | stride, | ||
int | pad, | ||
float * | data_col | ||
) |
void repack_input_gpu | ( | float * | input, |
float * | re_packed_input, | ||
int | w, | ||
int | h, | ||
int | c | ||
) |
void repack_input_gpu_bin | ( | float * | input, |
uint32_t * | re_packed_input_bin, | ||
int | w, | ||
int | h, | ||
int | c | ||
) |
__global__ void repack_input_kernel | ( | float * | input, |
float * | re_packed_input, | ||
int | w, | ||
int | h, | ||
int | c | ||
) |
__global__ void repack_input_kernel_2 | ( | float * | input, |
float * | re_packed_input, | ||
int | w, | ||
int | h, | ||
int | c | ||
) |
__global__ void repack_input_kernel_bin | ( | float * | input, |
uint32_t * | re_packed_input_bin, | ||
int | w, | ||
int | h, | ||
int | c | ||
) |
__device__ uint32_t reverse_32_bit | ( | uint32_t | a | ) |
__device__ __host__ uint8_t reverse_8_bit | ( | uint8_t | a | ) |
__device__ __host__ unsigned char reverse_byte | ( | unsigned char | a | ) |
__device__ __host__ unsigned char reverse_byte_2 | ( | unsigned char | a | ) |
__device__ unsigned char reverse_byte_CUDA | ( | unsigned char | a | ) |
__device__ void transpose32_optimized | ( | uint32_t | A[32] | ) |
__device__ void transpose8rS32_reversed_diagonale | ( | unsigned char * | A, |
unsigned char * | B, | ||
int | m, | ||
int | n | ||
) |
__device__ void transpose_32x32_bits_reversed_diagonale | ( | uint32_t * | A, |
uint32_t * | B, | ||
int | m, | ||
int | n | ||
) |
void transpose_bin_gpu | ( | unsigned char * | A, |
unsigned char * | B, | ||
const int | n, | ||
const int | m, | ||
const int | lda, | ||
const int | ldb, | ||
const int | block_size | ||
) |
__global__ void transpose_bin_gpu_kernel | ( | unsigned char * | A, |
unsigned char * | B, | ||
const int | n, | ||
const int | m, | ||
const int | lda, | ||
const int | ldb, | ||
const int | block_size | ||
) |
__global__ void transpose_bin_gpu_kernel_32 | ( | uint32_t * | A, |
uint32_t * | B, | ||
const int | n, | ||
const int | m, | ||
const int | lda, | ||
const int | ldb, | ||
const int | block_size | ||
) |
void transpose_uint32_gpu | ( | uint32_t * | src, |
uint32_t * | dst, | ||
int | src_h, | ||
int | src_w, | ||
int | src_align, | ||
int | dst_align | ||
) |
__global__ void transpose_uint32_kernel | ( | uint32_t * | src, |
uint32_t * | dst, | ||
int | src_h, | ||
int | src_w, | ||
int | src_align, | ||
int | dst_align | ||
) |
__global__ void transpose_uint32_kernel_2 | ( | uint32_t * | src, |
uint32_t * | dst, | ||
int | src_h, | ||
int | src_w, | ||
int | src_align, | ||
int | dst_align | ||
) |
__inline__ __device__ int warpAllReduceSum | ( | int | val | ) |
|
inlinestatic |
|
inlinestatic |
|
inlinestatic |
|
inlinestatic |
const int CAFFE_CUDA_NUM_THREADS = 512 |