General matrix multiplication (GEMM).
Macros | |
#define | PUT_IN_REGISTER register |
#define | TILE_K 16 |
#define | TILE_M 4 |
#define | TILE_N 16 |
Functions | |
void | activate_array_cpu_custom (float *x, const int n, const ACTIVATION a) |
void | binary_int32_printf (uint32_t src) |
void | binary_int64_printf (uint64_t src) |
void | convolution_2d (int w, int h, int ksize, int n, int c, int pad, int stride, float *weights, float *input, float *output, float *mean) |
void | convolution_repacked (uint32_t *packed_input, uint32_t *packed_weights, float *output, int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr) |
static uint32_t | fill_bit_int32 (char src) |
static uint64_t | fill_bit_int64 (char src) |
void | float_to_bit (float *src, unsigned char *dst, size_t size) |
void | forward_maxpool_layer_avx (float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c, int pad, int stride, int batch) |
void | gemm (int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc) |
void | gemm_bin (int M, int N, int K, float ALPHA, char *A, int lda, float *B, int ldb, float *C, int ldc) |
void | gemm_cpu (int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc) |
void | gemm_gpu (int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc) |
void | gemm_nn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc) |
void | gemm_nn_bin_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr) |
void | gemm_nn_bin_transposed_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr) |
void | gemm_nn_custom_bin_mean_transposed (int M, int N, int K, float ALPHA_UNUSED, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr) |
void | gemm_nn_fast (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc) |
void | gemm_nt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc) |
void | gemm_ongpu (int TA, int TB, int M, int N, int K, float ALPHA, float *A_gpu, int lda, float *B_gpu, int ldb, float BETA, float *C_gpu, int ldc) |
void | gemm_tn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc) |
void | gemm_tt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc) |
static uint32_t | get_bit_int32 (uint32_t const *const src, size_t index) |
void | im2col_cpu_custom (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col) |
void | im2col_cpu_custom_bin (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align) |
void | im2col_cpu_custom_transpose (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int ldb_align) |
void | init_cpu () |
int | is_avx () |
int | is_fma_avx2 () |
float * | random_matrix (int rows, int cols) |
void | repack_input (float *input, float *re_packed_input, int w, int h, int c) |
void | test_gpu_accuracy (int TA, int TB, int m, int k, int n) |
int | test_gpu_blas () |
void | time_gpu_random_matrix (int TA, int TB, int m, int k, int n) |
void | time_ongpu (int TA, int TB, int m, int k, int n) |
void | time_random_matrix (int TA, int TB, int m, int k, int n) |
void | transpose_bin (uint32_t *A, uint32_t *B, const int n, const int m, const int lda, const int ldb, const int block_size) |
void | transpose_block_SSE4x4 (float *A, float *B, const int n, const int m, const int lda, const int ldb, const int block_size) |
static void | transpose_scalar_block (float *A, float *B, const int lda, const int ldb, const int block_size) |
void | transpose_uint32 (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align) |
static unsigned char | xnor (unsigned char a, unsigned char b) |
static uint32_t | xnor_int32 (uint32_t a, uint32_t b) |
static uint64_t | xnor_int64 (uint64_t a, uint64_t b) |
General matrix multiplication (GEMM)
#define PUT_IN_REGISTER register |
#define TILE_K 16 |
#define TILE_M 4 |
#define TILE_N 16 |
void activate_array_cpu_custom | ( | float * | x, |
const int | n, | ||
const ACTIVATION | a | ||
) |
void binary_int32_printf | ( | uint32_t | src | ) |
void binary_int64_printf | ( | uint64_t | src | ) |
void convolution_2d | ( | int | w, |
int | h, | ||
int | ksize, | ||
int | n, | ||
int | c, | ||
int | pad, | ||
int | stride, | ||
float * | weights, | ||
float * | input, | ||
float * | output, | ||
float * | mean | ||
) |
void convolution_repacked | ( | uint32_t * | packed_input, |
uint32_t * | packed_weights, | ||
float * | output, | ||
int | w, | ||
int | h, | ||
int | c, | ||
int | n, | ||
int | size, | ||
int | pad, | ||
int | new_lda, | ||
float * | mean_arr | ||
) |
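static uint32_t fill_bit_int32 | ( | char | src | ) | inline static
static uint64_t fill_bit_int64 | ( | char | src | ) | inline static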
void float_to_bit | ( | float * | src, |
unsigned char * | dst, | ||
size_t | size | ||
) |
void forward_maxpool_layer_avx | ( | float * | src, |
float * | dst, | ||
int * | indexes, | ||
int | size, | ||
int | w, | ||
int | h, | ||
int | out_w, | ||
int | out_h, | ||
int | c, | ||
int | pad, | ||
int | stride, | ||
int | batch | ||
) |
void gemm | ( | int | TA, |
int | TB, | ||
int | M, | ||
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float | BETA, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_bin | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
char * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_cpu | ( | int | TA, |
int | TB, | ||
int | M, | ||
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float | BETA, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_gpu | ( | int | TA, |
int | TB, | ||
int | M, | ||
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float | BETA, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_nn | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_nn_bin_32bit_packed | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
uint32_t * | A, | ||
int | lda, | ||
uint32_t * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc, | ||
float * | mean_arr | ||
) |
void gemm_nn_bin_transposed_32bit_packed | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
uint32_t * | A, | ||
int | lda, | ||
uint32_t * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc, | ||
float * | mean_arr | ||
) |
void gemm_nn_custom_bin_mean_transposed | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA_UNUSED, | ||
unsigned char * | A, | ||
int | lda, | ||
unsigned char * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc, | ||
float * | mean_arr | ||
) |
void gemm_nn_fast | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_nt | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_ongpu | ( | int | TA, |
int | TB, | ||
int | M, | ||
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A_gpu, | ||
int | lda, | ||
float * | B_gpu, | ||
int | ldb, | ||
float | BETA, | ||
float * | C_gpu, | ||
int | ldc | ||
) |
void gemm_tn | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc | ||
) |
void gemm_tt | ( | int | M, |
int | N, | ||
int | K, | ||
float | ALPHA, | ||
float * | A, | ||
int | lda, | ||
float * | B, | ||
int | ldb, | ||
float * | C, | ||
int | ldc | ||
) |
static uint32_t get_bit_int32 | ( | uint32_t const *const | src, |
size_t | index | ||
) | inline static
void im2col_cpu_custom | ( | float * | data_im, |
int | channels, | ||
int | height, | ||
int | width, | ||
int | ksize, | ||
int | stride, | ||
int | pad, | ||
float * | data_col | ||
) |
void im2col_cpu_custom_bin | ( | float * | data_im, |
int | channels, | ||
int | height, | ||
int | width, | ||
int | ksize, | ||
int | stride, | ||
int | pad, | ||
float * | data_col, | ||
int | bit_align | ||
) |
void im2col_cpu_custom_transpose | ( | float * | data_im, |
int | channels, | ||
int | height, | ||
int | width, | ||
int | ksize, | ||
int | stride, | ||
int | pad, | ||
float * | data_col, | ||
int | ldb_align | ||
) |
void init_cpu | ( | ) |
int is_avx | ( | ) |
int is_fma_avx2 | ( | ) |
float * random_matrix | ( | int | rows, |
int | cols | ||
) |
void repack_input | ( | float * | input, |
float * | re_packed_input, | ||
int | w, | ||
int | h, | ||
int | c | ||
) |
void test_gpu_accuracy | ( | int | TA, |
int | TB, | ||
int | m, | ||
int | k, | ||
int | n | ||
) |
int test_gpu_blas | ( | ) |
void time_gpu_random_matrix | ( | int | TA, |
int | TB, | ||
int | m, | ||
int | k, | ||
int | n | ||
) |
void time_ongpu | ( | int | TA, |
int | TB, | ||
int | m, | ||
int | k, | ||
int | n | ||
) |
void time_random_matrix | ( | int | TA, |
int | TB, | ||
int | m, | ||
int | k, | ||
int | n | ||
) |
void transpose_bin | ( | uint32_t * | A, |
uint32_t * | B, | ||
const int | n, | ||
const int | m, | ||
const int | lda, | ||
const int | ldb, | ||
const int | block_size | ||
) |
void transpose_block_SSE4x4 | ( | float * | A, |
float * | B, | ||
const int | n, | ||
const int | m, | ||
const int | lda, | ||
const int | ldb, | ||
const int | block_size | ||
) |
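static void transpose_scalar_block | ( | float * | A, |
float * | B, | ||
const int | lda, | ||
const int | ldb, | ||
const int | block_size | ||
) | inline static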
void transpose_uint32 | ( | uint32_t * | src, |
uint32_t * | dst, | ||
int | src_h, | ||
int | src_w, | ||
int | src_align, | ||
int | dst_align | ||
) |
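static unsigned char xnor | ( | unsigned char | a, |
unsigned char | b | ||
) | inline static
static uint32_t xnor_int32 | ( | uint32_t | a, |
uint32_t | b | ||
) | inline static
static uint64_t xnor_int64 | ( | uint64_t | a, |
uint64_t | b | ||
) | inline static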