Darknet/YOLO v3.0-208-g0b6f60f-dirty
Object Detection Framework
 
Loading...
Searching...
No Matches
im2col_kernels.cu File Reference

Macros

#define CUDA_KERNEL_LOOP(i, n)
 
#define swap(a0, a1, j, m)   t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);
 

Functions

template<typename T >
__device__ uint32_t __ballot_custom (T val)
 
template<typename T1 , typename T2 >
__device__ T1 __shfl_custom (T1 val, T2 lane)
 
int CAFFE_GET_BLOCKS (const int N)
 
void convolve_bin_gpu (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, int new_lda, float *mean_arr_gpu)
 
__global__ void convolve_bin_gpu_kernel (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, int new_lda, float *mean_arr_gpu)
 
void convolve_gpu (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
 
__global__ void convolve_gpu_kernel (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)
 
void fill_int8_gpu (unsigned char *src, unsigned char val, size_t size)
 
__global__ void fill_int8_gpu_kernel (unsigned char *src, unsigned char val, size_t size)
 
void float_to_bit_gpu (float *src, unsigned char *dst, size_t size)
 
__global__ void float_to_bit_gpu_kernel (float *src, unsigned char *dst, size_t size)
 
void gemm_nn_custom_bin_mean_transposed_gpu (int M, int N, int K, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr, float *bias, int leaky_activation, float *shortcut_in_gpu, float *shortcut_out_gpu)
 
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel (int M, int N, int K, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation, float *shortcut_in_gpu, float *shortcut_out_gpu)
 
__device__ static __host__ unsigned char get_bit (unsigned char const *const src, size_t index)
 
__global__ void im2col_align_bin_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int channels, const int pad, const int stride, const int height_col, const int width_col, float *data_col, const int bit_align)
 
void im2col_align_bin_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align)
 
__global__ void im2col_align_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, float *data_col, const int bit_align)
 
void im2col_align_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align)
 
void im2col_gpu_ext (const float *data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, float *data_col)
 
__global__ void im2col_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, float *data_col)
 
__global__ void im2col_gpu_kernel_ext (const int n, const float *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int height_col, const int width_col, float *data_col)
 
void im2col_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col)
 
void repack_input_gpu (float *input, float *re_packed_input, int w, int h, int c)
 
void repack_input_gpu_bin (float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
 
__global__ void repack_input_kernel (float *input, float *re_packed_input, int w, int h, int c)
 
__global__ void repack_input_kernel_2 (float *input, float *re_packed_input, int w, int h, int c)
 
__global__ void repack_input_kernel_bin (float *input, uint32_t *re_packed_input_bin, int w, int h, int c)
 
__device__ uint32_t reverse_32_bit (uint32_t a)
 
__device__ __host__ uint8_t reverse_8_bit (uint8_t a)
 
__device__ __host__ unsigned char reverse_byte (unsigned char a)
 
__device__ __host__ unsigned char reverse_byte_2 (unsigned char a)
 
__device__ unsigned char reverse_byte_CUDA (unsigned char a)
 
__device__ void transpose32_optimized (uint32_t A[32])
 
__device__ void transpose8rS32_reversed_diagonale (unsigned char *A, unsigned char *B, int m, int n)
 
__device__ void transpose_32x32_bits_reversed_diagonale (uint32_t *A, uint32_t *B, int m, int n)
 
void transpose_bin_gpu (unsigned char *A, unsigned char *B, const int n, const int m, const int lda, const int ldb, const int block_size)
 
__global__ void transpose_bin_gpu_kernel (unsigned char *A, unsigned char *B, const int n, const int m, const int lda, const int ldb, const int block_size)
 
__global__ void transpose_bin_gpu_kernel_32 (uint32_t *A, uint32_t *B, const int n, const int m, const int lda, const int ldb, const int block_size)
 
void transpose_uint32_gpu (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
 
__global__ void transpose_uint32_kernel (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
 
__global__ void transpose_uint32_kernel_2 (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)
 
__inline__ __device__ int warpAllReduceSum (int val)
 
__device__ static __host__ uint8_t xnor_bit1 (uint8_t a, uint8_t b)
 
__device__ static __host__ ulonglong4 xor_int256 (ulonglong4 a, ulonglong4 b)
 
__device__ static __host__ uint32_t xor_int32 (uint32_t a, uint32_t b)
 
__device__ static __host__ uint64_t xor_int64 (uint64_t a, uint64_t b)
 

Variables

const int CAFFE_CUDA_NUM_THREADS = 512
 

Macro Definition Documentation

◆ CUDA_KERNEL_LOOP

#define CUDA_KERNEL_LOOP (   i,
  n 
)
Value:
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)

◆ swap

#define swap (   a0,
  a1,
  j,
  m 
)    t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);

Function Documentation

◆ __ballot_custom()

template<typename T >
__device__ uint32_t __ballot_custom ( T  val)
inline
Here is the caller graph for this function:

◆ __shfl_custom()

template<typename T1 , typename T2 >
__device__ T1 __shfl_custom ( T1  val,
T2  lane 
)
inline
Here is the caller graph for this function:

◆ CAFFE_GET_BLOCKS()

int CAFFE_GET_BLOCKS ( const int  N)
inline
Here is the caller graph for this function:

◆ convolve_bin_gpu()

void convolve_bin_gpu ( float *  input,
float *  weights,
float *  output,
int  in_w,
int  in_h,
int  in_c,
int  n,
int  size,
int  pad,
int  new_lda,
float *  mean_arr_gpu 
)
Here is the call graph for this function:

◆ convolve_bin_gpu_kernel()

__global__ void convolve_bin_gpu_kernel ( float *  input,
float *  weights,
float *  output,
int  in_w,
int  in_h,
int  in_c,
int  n,
int  size,
int  pad,
int  new_lda,
float *  mean_arr_gpu 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ convolve_gpu()

void convolve_gpu ( float *  input,
float *  weights,
float *  output,
int  in_w,
int  in_h,
int  in_c,
int  n,
int  size,
int  pad 
)
Here is the call graph for this function:

◆ convolve_gpu_kernel()

__global__ void convolve_gpu_kernel ( float *  input,
float *  weights,
float *  output,
int  in_w,
int  in_h,
int  in_c,
int  n,
int  size,
int  pad 
)
Here is the caller graph for this function:

◆ fill_int8_gpu()

void fill_int8_gpu ( unsigned char *  src,
unsigned char  val,
size_t  size 
)
Here is the call graph for this function:

◆ fill_int8_gpu_kernel()

__global__ void fill_int8_gpu_kernel ( unsigned char *  src,
unsigned char  val,
size_t  size 
)
Here is the caller graph for this function:

◆ float_to_bit_gpu()

void float_to_bit_gpu ( float *  src,
unsigned char *  dst,
size_t  size 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ float_to_bit_gpu_kernel()

__global__ void float_to_bit_gpu_kernel ( float *  src,
unsigned char *  dst,
size_t  size 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ gemm_nn_custom_bin_mean_transposed_gpu()

void gemm_nn_custom_bin_mean_transposed_gpu ( int  M,
int  N,
int  K,
unsigned char *  A,
int  lda,
unsigned char *  B,
int  ldb,
float *  C,
int  ldc,
float *  mean_arr,
float *  bias,
int  leaky_activation,
float *  shortcut_in_gpu,
float *  shortcut_out_gpu 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ gemm_nn_custom_bin_mean_transposed_gpu_kernel()

__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel ( int  M,
int  N,
int  K,
unsigned char *  A,
int  lda,
unsigned char *  B,
int  ldb,
float *  C,
int  ldc,
float *  mean_arr,
float *  bias_arr,
int  leaky_activation,
float *  shortcut_in_gpu,
float *  shortcut_out_gpu 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ get_bit()

__device__ static __host__ unsigned char get_bit ( unsigned char const *const  src,
size_t  index 
)
inlinestatic
Here is the caller graph for this function:

◆ im2col_align_bin_gpu_kernel()

__global__ void im2col_align_bin_gpu_kernel ( const int  n,
const float *  data_im,
const int  height,
const int  width,
const int  ksize,
const int  channels,
const int  pad,
const int  stride,
const int  height_col,
const int  width_col,
float *  data_col,
const int  bit_align 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ im2col_align_bin_ongpu()

void im2col_align_bin_ongpu ( float *  im,
int  channels,
int  height,
int  width,
int  ksize,
int  stride,
int  pad,
float *  data_col,
int  bit_align 
)
Here is the call graph for this function:

◆ im2col_align_gpu_kernel()

__global__ void im2col_align_gpu_kernel ( const int  n,
const float *  data_im,
const int  height,
const int  width,
const int  ksize,
const int  pad,
const int  stride,
const int  height_col,
const int  width_col,
float *  data_col,
const int  bit_align 
)
Here is the caller graph for this function:

◆ im2col_align_ongpu()

void im2col_align_ongpu ( float *  im,
int  channels,
int  height,
int  width,
int  ksize,
int  stride,
int  pad,
float *  data_col,
int  bit_align 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ im2col_gpu_ext()

void im2col_gpu_ext ( const float *  data_im,
const int  channels,
const int  height,
const int  width,
const int  kernel_h,
const int  kernel_w,
const int  pad_h,
const int  pad_w,
const int  stride_h,
const int  stride_w,
const int  dilation_h,
const int  dilation_w,
float *  data_col 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ im2col_gpu_kernel()

__global__ void im2col_gpu_kernel ( const int  n,
const float *  data_im,
const int  height,
const int  width,
const int  ksize,
const int  pad,
const int  stride,
const int  height_col,
const int  width_col,
float *  data_col 
)
Here is the caller graph for this function:

◆ im2col_gpu_kernel_ext()

__global__ void im2col_gpu_kernel_ext ( const int  n,
const float *  data_im,
const int  height,
const int  width,
const int  kernel_h,
const int  kernel_w,
const int  pad_h,
const int  pad_w,
const int  stride_h,
const int  stride_w,
const int  dilation_h,
const int  dilation_w,
const int  height_col,
const int  width_col,
float *  data_col 
)
Todo:
V3 what "index" is this referencing? How does this compile?
Here is the caller graph for this function:

◆ im2col_ongpu()

void im2col_ongpu ( float *  im,
int  channels,
int  height,
int  width,
int  ksize,
int  stride,
int  pad,
float *  data_col 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ repack_input_gpu()

void repack_input_gpu ( float *  input,
float *  re_packed_input,
int  w,
int  h,
int  c 
)
Here is the call graph for this function:

◆ repack_input_gpu_bin()

void repack_input_gpu_bin ( float *  input,
uint32_t *  re_packed_input_bin,
int  w,
int  h,
int  c 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ repack_input_kernel()

__global__ void repack_input_kernel ( float *  input,
float *  re_packed_input,
int  w,
int  h,
int  c 
)
Here is the caller graph for this function:

◆ repack_input_kernel_2()

__global__ void repack_input_kernel_2 ( float *  input,
float *  re_packed_input,
int  w,
int  h,
int  c 
)

◆ repack_input_kernel_bin()

__global__ void repack_input_kernel_bin ( float *  input,
uint32_t *  re_packed_input_bin,
int  w,
int  h,
int  c 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ reverse_32_bit()

__device__ uint32_t reverse_32_bit ( uint32_t  a)
Here is the caller graph for this function:

◆ reverse_8_bit()

__device__ __host__ uint8_t reverse_8_bit ( uint8_t  a)

◆ reverse_byte()

__device__ __host__ unsigned char reverse_byte ( unsigned char  a)

◆ reverse_byte_2()

__device__ __host__ unsigned char reverse_byte_2 ( unsigned char  a)

◆ reverse_byte_CUDA()

__device__ unsigned char reverse_byte_CUDA ( unsigned char  a)
Here is the caller graph for this function:

◆ transpose32_optimized()

__device__ void transpose32_optimized ( uint32_t  A[32])
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose8rS32_reversed_diagonale()

__device__ void transpose8rS32_reversed_diagonale ( unsigned char *  A,
unsigned char *  B,
int  m,
int  n 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose_32x32_bits_reversed_diagonale()

__device__ void transpose_32x32_bits_reversed_diagonale ( uint32_t *  A,
uint32_t *  B,
int  m,
int  n 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose_bin_gpu()

void transpose_bin_gpu ( unsigned char *  A,
unsigned char *  B,
const int  n,
const int  m,
const int  lda,
const int  ldb,
const int  block_size 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose_bin_gpu_kernel()

__global__ void transpose_bin_gpu_kernel ( unsigned char *  A,
unsigned char *  B,
const int  n,
const int  m,
const int  lda,
const int  ldb,
const int  block_size 
)
Here is the call graph for this function:

◆ transpose_bin_gpu_kernel_32()

__global__ void transpose_bin_gpu_kernel_32 ( uint32_t *  A,
uint32_t *  B,
const int  n,
const int  m,
const int  lda,
const int  ldb,
const int  block_size 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose_uint32_gpu()

void transpose_uint32_gpu ( uint32_t *  src,
uint32_t *  dst,
int  src_h,
int  src_w,
int  src_align,
int  dst_align 
)
Here is the call graph for this function:
Here is the caller graph for this function:

◆ transpose_uint32_kernel()

__global__ void transpose_uint32_kernel ( uint32_t *  src,
uint32_t *  dst,
int  src_h,
int  src_w,
int  src_align,
int  dst_align 
)
Here is the caller graph for this function:

◆ transpose_uint32_kernel_2()

__global__ void transpose_uint32_kernel_2 ( uint32_t *  src,
uint32_t *  dst,
int  src_h,
int  src_w,
int  src_align,
int  dst_align 
)

◆ warpAllReduceSum()

__inline__ __device__ int warpAllReduceSum ( int  val)
Here is the caller graph for this function:

◆ xnor_bit1()

__device__ static __host__ uint8_t xnor_bit1 ( uint8_t  a,
uint8_t  b 
)
inlinestatic
Here is the caller graph for this function:

◆ xor_int256()

__device__ static __host__ ulonglong4 xor_int256 ( ulonglong4  a,
ulonglong4  b 
)
inlinestatic
Here is the caller graph for this function:

◆ xor_int32()

__device__ static __host__ uint32_t xor_int32 ( uint32_t  a,
uint32_t  b 
)
inlinestatic
Here is the caller graph for this function:

◆ xor_int64()

__device__ static __host__ uint64_t xor_int64 ( uint64_t  a,
uint64_t  b 
)
inlinestatic
Here is the caller graph for this function:

Variable Documentation

◆ CAFFE_CUDA_NUM_THREADS

const int CAFFE_CUDA_NUM_THREADS = 512