Darknet/YOLO v4.0-11-gbfab9ec
Object Detection Framework
 
Loading...
Searching...
No Matches
gemm.cpp File Reference

General matrix multiplication (GEMM) More...

Include dependency graph for gemm.cpp:

Macros

#define PUT_IN_REGISTER   register
 
#define swap(a0, a1, j, m)   t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);
 
#define TILE_K   16
 
#define TILE_M   4
 
#define TILE_N   16
 

Functions

void activate_array_cpu_custom (float *x, const int n, const ACTIVATION a)
 
void convolution_2d (int w, int h, int ksize, int n, int c, int pad, int stride, float *weights, float *input, float *output, float *mean)
 
void convolution_repacked (uint32_t *packed_input, uint32_t *packed_weights, float *output, int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr)
 
void float_to_bit (float *src, unsigned char *dst, size_t size)
 
void forward_maxpool_layer_avx (float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c, int pad, int stride, int batch)
 
void gemm (int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc)
 
void gemm_cpu (int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc)
 
void gemm_nn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
 
void gemm_nn_bin_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr)
 
void gemm_nn_bin_transposed_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr)
 
void gemm_nn_custom_bin_mean_transposed (int M, int N, int K, float ALPHA_UNUSED, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr)
 
void gemm_nn_fast (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
 
void gemm_nt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
 
void gemm_tn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
 
void gemm_tt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)
 
void im2col_cpu_custom (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col)
 
void im2col_cpu_custom_bin (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align)
 
void im2col_cpu_custom_transpose (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int ldb_align)
 
void init_cpu ()
 
int is_avx ()
 
int is_fma_avx2 ()
 
float * random_matrix (int rows, int cols)
 
void repack_input (float *input, float *re_packed_input, int w, int h, int c)
 
uint32_t reverse_32_bit (uint32_t a)
 
uint8_t reverse_8_bit (uint8_t a)
 
unsigned char reverse_byte (unsigned char a)
 
void transpose32_optimized (uint32_t A[32])
 
void transpose8rS32_reversed_diagonale (unsigned char *A, unsigned char *B, int m, int n)
 
void transpose_32x32_bits_reversed_diagonale (uint32_t *A, uint32_t *B, int m, int n)
 
void transpose_bin (uint32_t *A, uint32_t *B, const int n, const int m, const int lda, const int ldb, const int block_size)
 
void transpose_block_SSE4x4 (float *A, float *B, const int n, const int m, const int lda, const int ldb, const int block_size)
 
static void transpose_scalar_block (float *A, float *B, const int lda, const int ldb, const int block_size)
 
void transpose_uint32 (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
 

Detailed Description

General matrix multiplication (GEMM)

Macro Definition Documentation

◆ PUT_IN_REGISTER

#define PUT_IN_REGISTER   register

◆ swap

#define swap (   a0,
  a1,
  j,
  m 
)    t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);

◆ TILE_K

#define TILE_K   16

◆ TILE_M

#define TILE_M   4
Todo:
V3 Would be nice to know where this file came from, and to see if there are updates available.

◆ TILE_N

#define TILE_N   16

Function Documentation

◆ activate_array_cpu_custom()

void activate_array_cpu_custom ( float *  x,
const int  n,
const ACTIVATION  a 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ convolution_2d()

void convolution_2d ( int  w,
int  h,
int  ksize,
int  n,
int  c,
int  pad,
int  stride,
float *  weights,
float *  input,
float *  output,
float *  mean 
)

◆ convolution_repacked()

void convolution_repacked ( uint32_t *  packed_input,
uint32_t *  packed_weights,
float *  output,
int  w,
int  h,
int  c,
int  n,
int  size,
int  pad,
int  new_lda,
float *  mean_arr 
)

◆ float_to_bit()

void float_to_bit ( float *  src,
unsigned char *  dst,
size_t  size 
)
Here is the caller graph for this function:

◆ forward_maxpool_layer_avx()

void forward_maxpool_layer_avx ( float *  src,
float *  dst,
int *  indexes,
int  size,
int  w,
int  h,
int  out_w,
int  out_h,
int  c,
int  pad,
int  stride,
int  batch 
)
Here is the caller graph for this function:

◆ gemm()

void gemm ( int  TA,
int  TB,
int  M,
int  N,
int  K,
float  ALPHA,
float *  A,
int  lda,
float *  B,
int  ldb,
float  BETA,
float *  C,
int  ldc 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ gemm_cpu()

void gemm_cpu ( int  TA,
int  TB,
int  M,
int  N,
int  K,
float  ALPHA,
float *  A,
int  lda,
float *  B,
int  ldb,
float  BETA,
float *  C,
int  ldc 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ gemm_nn()

void gemm_nn ( int  M,
int  N,
int  K,
float  ALPHA,
float *  A,
int  lda,
float *  B,
int  ldb,
float *  C,
int  ldc 
)
Here is the caller graph for this function:

◆ gemm_nn_bin_32bit_packed()

void gemm_nn_bin_32bit_packed ( int  M,
int  N,
int  K,
float  ALPHA,
uint32_t *  A,
int  lda,
uint32_t *  B,
int  ldb,
float *  C,
int  ldc,
float *  mean_arr 
)

◆ gemm_nn_bin_transposed_32bit_packed()

void gemm_nn_bin_transposed_32bit_packed ( int  M,
int  N,
int  K,
float  ALPHA,
uint32_t *  A,
int  lda,
uint32_t *  B,
int  ldb,
float *  C,
int  ldc,
float *  mean_arr 
)

◆ gemm_nn_custom_bin_mean_transposed()

void gemm_nn_custom_bin_mean_transposed ( int  M,
int  N,
int  K,
float  ALPHA_UNUSED,
unsigned char *  A,
int  lda,
unsigned char *  B,
int  ldb,
float *  C,
int  ldc,
float *  mean_arr 
)
Here is the caller graph for this function:

◆ gemm_nn_fast()

void gemm_nn_fast ( int  M,
int  N,
int  K,
float  ALPHA,
float *  A,
int  lda,
float *  B,
int  ldb,
float *  C,
int  ldc 
)
Here is the caller graph for this function:

◆ gemm_nt()

void gemm_nt ( int  M,
int  N,
int  K,
float  ALPHA,
float *  A,
int  lda,
float *  B,
int  ldb,
float *  C,
int  ldc 
)
Here is the caller graph for this function:

◆ gemm_tn()

void gemm_tn ( int  M,
int  N,
int  K,
float  ALPHA,
float *  A,
int  lda,
float *  B,
int  ldb,
float *  C,
int  ldc 
)
Here is the caller graph for this function:

◆ gemm_tt()

void gemm_tt ( int  M,
int  N,
int  K,
float  ALPHA,
float *  A,
int  lda,
float *  B,
int  ldb,
float *  C,
int  ldc 
)
Here is the caller graph for this function:

◆ im2col_cpu_custom()

void im2col_cpu_custom ( float *  data_im,
int  channels,
int  height,
int  width,
int  ksize,
int  stride,
int  pad,
float *  data_col 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ im2col_cpu_custom_bin()

void im2col_cpu_custom_bin ( float *  data_im,
int  channels,
int  height,
int  width,
int  ksize,
int  stride,
int  pad,
float *  data_col,
int  bit_align 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ im2col_cpu_custom_transpose()

void im2col_cpu_custom_transpose ( float *  data_im,
int  channels,
int  height,
int  width,
int  ksize,
int  stride,
int  pad,
float *  data_col,
int  ldb_align 
)

◆ init_cpu()

void init_cpu ( )
Here is the call graph for this function:
Here is the caller graph for this function:

◆ is_avx()

int is_avx ( )
Here is the caller graph for this function:

◆ is_fma_avx2()

int is_fma_avx2 ( )
Here is the caller graph for this function:

◆ random_matrix()

float * random_matrix ( int  rows,
int  cols 
)

◆ repack_input()

void repack_input ( float *  input,
float *  re_packed_input,
int  w,
int  h,
int  c 
)
Here is the caller graph for this function:

◆ reverse_32_bit()

uint32_t reverse_32_bit ( uint32_t  a)
Note
This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ reverse_8_bit()

uint8_t reverse_8_bit ( uint8_t  a)
Note
This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.
Here is the caller graph for this function:

◆ reverse_byte()

unsigned char reverse_byte ( unsigned char  a)
Note
This function is for CPU-only versions of Darknet.
Here is the caller graph for this function:

◆ transpose32_optimized()

void transpose32_optimized ( uint32_t  A[32])
Note
This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose8rS32_reversed_diagonale()

void transpose8rS32_reversed_diagonale ( unsigned char *  A,
unsigned char *  B,
int  m,
int  n 
)
Note
This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.
Here is the call graph for this function:

◆ transpose_32x32_bits_reversed_diagonale()

void transpose_32x32_bits_reversed_diagonale ( uint32_t *  A,
uint32_t *  B,
int  m,
int  n 
)
Note
This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose_bin()

void transpose_bin ( uint32_t *  A,
uint32_t *  B,
const int  n,
const int  m,
const int  lda,
const int  ldb,
const int  block_size 
)
Note
This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.
Todo:
V3 this will never run...right? Isn't "j" always going to be >= "m" by the time we get here?
Here is the call graph for this function:

◆ transpose_block_SSE4x4()

void transpose_block_SSE4x4 ( float *  A,
float *  B,
const int  n,
const int  m,
const int  lda,
const int  ldb,
const int  block_size 
)

◆ transpose_scalar_block()

static void transpose_scalar_block ( float *  A,
float *  B,
const int  lda,
const int  ldb,
const int  block_size 
)
inlinestatic

◆ transpose_uint32()

void transpose_uint32 ( uint32_t *  src,
uint32_t *  dst,
int  src_h,
int  src_w,
int  src_align,
int  dst_align 
)
Here is the caller graph for this function: