General matrix multiplication (GEMM).

Include dependency graph for gemm.cpp:

Macros
#define	PUT_IN_REGISTER register

#define	swap(a0, a1, j, m) t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);

#define	TILE_K 16

#define	TILE_M 4

#define	TILE_N 16

Functions
void	activate_array_cpu_custom (float *x, const int n, const ACTIVATION a)

void	convolution_2d (int w, int h, int ksize, int n, int c, int pad, int stride, float *weights, float *input, float *output, float *mean)

void	convolution_repacked (uint32_t *packed_input, uint32_t *packed_weights, float *output, int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr)

void	float_to_bit (float *src, unsigned char *dst, size_t size)

void	forward_maxpool_layer_avx (float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c, int pad, int stride, int batch)

void	gemm_cpu (int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc)

void	gemm_nn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_nn_bin_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr)

void	gemm_nn_bin_transposed_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr)

void	gemm_nn_custom_bin_mean_transposed (int M, int N, int K, float ALPHA_UNUSED, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr)

void	gemm_nn_fast (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_nt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_tn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_tt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	im2col_cpu_custom (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col)

void	im2col_cpu_custom_bin (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align)

void	im2col_cpu_custom_transpose (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int ldb_align)

void	init_cpu ()

int	is_avx ()

int	is_fma_avx2 ()

float *	random_matrix (int rows, int cols)

void	repack_input (float *input, float *re_packed_input, int w, int h, int c)

uint32_t	reverse_32_bit (uint32_t a)

uint8_t	reverse_8_bit (uint8_t a)

unsigned char	reverse_byte (unsigned char a)

void	transpose32_optimized (uint32_t A[32])

void	transpose8rS32_reversed_diagonale (unsigned char *A, unsigned char *B, int m, int n)

void	transpose_32x32_bits_reversed_diagonale (uint32_t *A, uint32_t *B, int m, int n)

void	transpose_bin (uint32_t *A, uint32_t *B, const int n, const int m, const int lda, const int ldb, const int block_size)

void	transpose_block_SSE4x4 (float *A, float *B, const int n, const int m, const int lda, const int ldb, const int block_size)

static void	transpose_scalar_block (float *A, float *B, const int lda, const int ldb, const int block_size)

void	transpose_uint32 (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)

Detailed Description

General matrix multiplication (GEMM)

Macro Definition Documentation

◆ PUT_IN_REGISTER

#define PUT_IN_REGISTER register

◆ swap

#define swap	(	a0,
		a1,
		j,
		m
	)	t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);

◆ TILE_K

#define TILE_K 16

◆ TILE_M

#define TILE_M 4

Todo: V3: It would be nice to know where this file came from, and whether there are updates available.

◆ TILE_N

#define TILE_N 16

Function Documentation

◆ activate_array_cpu_custom()

void activate_array_cpu_custom	(	float *	x,
		const int	n,
		const ACTIVATION	a
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ convolution_2d()

void convolution_2d	(	int	w,
		int	h,
		int	ksize,
		int	n,
		int	c,
		int	pad,
		int	stride,
		float *	weights,
		float *	input,
		float *	output,
		float *	mean
	)

◆ convolution_repacked()

void convolution_repacked	(	uint32_t *	packed_input,
		uint32_t *	packed_weights,
		float *	output,
		int	w,
		int	h,
		int	c,
		int	n,
		int	size,
		int	pad,
		int	new_lda,
		float *	mean_arr
	)

◆ float_to_bit()

void float_to_bit	(	float *	src,
		unsigned char *	dst,
		size_t	size
	)

Here is the caller graph for this function:

◆ forward_maxpool_layer_avx()

void forward_maxpool_layer_avx	(	float *	src,
		float *	dst,
		int *	indexes,
		int	size,
		int	w,
		int	h,
		int	out_w,
		int	out_h,
		int	c,
		int	pad,
		int	stride,
		int	batch
	)

Here is the caller graph for this function:

◆ gemm_cpu()

void gemm_cpu	(	int	TA,
		int	TB,
		int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float	BETA,
		float *	C,
		int	ldc
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ gemm_nn()

void gemm_nn	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

Here is the caller graph for this function:

◆ gemm_nn_bin_32bit_packed()

void gemm_nn_bin_32bit_packed	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		uint32_t *	A,
		int	lda,
		uint32_t *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr
	)

◆ gemm_nn_bin_transposed_32bit_packed()

void gemm_nn_bin_transposed_32bit_packed	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		uint32_t *	A,
		int	lda,
		uint32_t *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr
	)

◆ gemm_nn_custom_bin_mean_transposed()

void gemm_nn_custom_bin_mean_transposed	(	int	M,
		int	N,
		int	K,
		float	ALPHA_UNUSED,
		unsigned char *	A,
		int	lda,
		unsigned char *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr
	)

Here is the caller graph for this function:

◆ gemm_nn_fast()

void gemm_nn_fast	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

Here is the caller graph for this function:

◆ gemm_nt()

void gemm_nt	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

Here is the caller graph for this function:

◆ gemm_tn()

void gemm_tn	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

Here is the caller graph for this function:

◆ gemm_tt()

void gemm_tt	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

Here is the caller graph for this function:

◆ im2col_cpu_custom()

void im2col_cpu_custom	(	float *	data_im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ im2col_cpu_custom_bin()

void im2col_cpu_custom_bin	(	float *	data_im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col,
		int	bit_align
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ im2col_cpu_custom_transpose()

void im2col_cpu_custom_transpose	(	float *	data_im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col,
		int	ldb_align
	)

◆ init_cpu()

void init_cpu ( )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ is_avx()

int is_avx ( )

Here is the caller graph for this function:

◆ is_fma_avx2()

int is_fma_avx2 ( )

Here is the caller graph for this function:

◆ random_matrix()

float * random_matrix	(	int	rows,
		int	cols
	)

◆ repack_input()

void repack_input	(	float *	input,
		float *	re_packed_input,
		int	w,
		int	h,
		int	c
	)

Here is the caller graph for this function:

◆ reverse_32_bit()

uint32_t reverse_32_bit ( uint32_t a )

Note: This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ reverse_8_bit()

uint8_t reverse_8_bit ( uint8_t a )

Note: This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.

Here is the caller graph for this function:

◆ reverse_byte()

unsigned char reverse_byte ( unsigned char a )

Note: This function is for CPU-only versions of Darknet.

Here is the caller graph for this function:

◆ transpose32_optimized()

void transpose32_optimized ( uint32_t A[32] )

Note: This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ transpose8rS32_reversed_diagonale()

void transpose8rS32_reversed_diagonale	(	unsigned char *	A,
		unsigned char *	B,
		int	m,
		int	n
	)

Note: This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.

Here is the call graph for this function:

◆ transpose_32x32_bits_reversed_diagonale()

void transpose_32x32_bits_reversed_diagonale	(	uint32_t *	A,
		uint32_t *	B,
		int	m,
		int	n
	)

Note: This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.

Here is the call graph for this function:

Here is the caller graph for this function:

◆ transpose_bin()

void transpose_bin	(	uint32_t *	A,
		uint32_t *	B,
		const int	n,
		const int	m,
		const int	lda,
		const int	ldb,
		const int	block_size
	)

Note: This function is for CPU-only versions of Darknet. See im2col_kernels.cu for GPU version.

Todo: V3: This will never run... right? Isn't "j" always going to be >= "m" by the time we get here?

Here is the call graph for this function:

◆ transpose_block_SSE4x4()

void transpose_block_SSE4x4	(	float *	A,
		float *	B,
		const int	n,
		const int	m,
		const int	lda,
		const int	ldb,
		const int	block_size
	)

◆ transpose_scalar_block()

static void transpose_scalar_block	(	float *	A,
		float *	B,
		const int	lda,
		const int	ldb,
		const int	block_size
	)

inline static

◆ transpose_uint32()

void transpose_uint32	(	uint32_t *	src,
		uint32_t *	dst,
		int	src_h,
		int	src_w,
		int	src_align,
		int	dst_align
	)

Here is the caller graph for this function:

Macros

Functions

Detailed Description

Macro Definition Documentation

◆ PUT_IN_REGISTER

◆ swap

◆ TILE_K

◆ TILE_M

◆ TILE_N

Function Documentation

◆ activate_array_cpu_custom()

◆ convolution_2d()

◆ convolution_repacked()

◆ float_to_bit()

◆ forward_maxpool_layer_avx()

◆ gemm_cpu()

◆ gemm_nn()

◆ gemm_nn_bin_32bit_packed()

◆ gemm_nn_bin_transposed_32bit_packed()

◆ gemm_nn_custom_bin_mean_transposed()

◆ gemm_nn_fast()

◆ gemm_nt()

◆ gemm_tn()

◆ gemm_tt()

◆ im2col_cpu_custom()

◆ im2col_cpu_custom_bin()

◆ im2col_cpu_custom_transpose()

◆ init_cpu()

◆ is_avx()

◆ is_fma_avx2()

◆ random_matrix()

◆ repack_input()

◆ reverse_32_bit()

◆ reverse_8_bit()

◆ reverse_byte()

◆ transpose32_optimized()

◆ transpose8rS32_reversed_diagonale()

◆ transpose_32x32_bits_reversed_diagonale()

◆ transpose_bin()

◆ transpose_block_SSE4x4()

◆ transpose_scalar_block()

◆ transpose_uint32()