Source code: https://github.com/lenguyen1807/mnist-cpp/tree/apple . Finally I can do something big ðŸ˜, this simple Neural Network (library or framework I don't know) can train LeNet5 (and hopefully AlexNet too).
NOTE: I just rewrite the whole source code (again), the old one looks really bad 💀 (update in 12-Jan-25). So this project took me 4 months 🥲.
TL;DR
How much improvement I get ?
The old version which took about 37 minutes to complete.
This is where I add BLAS optimization, got about 12 minutes
Got my first macbook so I need to rewirte my project for Apple. This is where I improve the code structure and add more layers (Convolution, Drop out) and M2 CPU is so crazy, it took only 2 minutes
Some details
1. Code structures
I want to mimic how Pytorch training (and testing) loop works so I used the same approach, I don't know it is the correct way or not 🥲 but it looks like this:
for ( const auto& img : trainset.dataset) {
// forward pass
auto logits = model. forward (img->data);
// calculate loss
float loss = loss_fn (logits, img->label);
train_loss += loss;
// calculate accuracy
int pred_label = loss_fn. get_pred (). arg_max ();
int true_label = img->label. arg_max ();
train_correct += (pred_label == true_label) ? 1.0 f : 0.0 f ;
// backward pass
model. backward (loss_fn. get_loss_grad ());
// update weight
optim. step ();
// zero all gradients for next iteration
model. zero_grad ();
}
Also trying to simulate a custom class for model in Pytorch:
// MLP.h
class MLP : public nn :: Module
{
public:
// same for __init__ in pytorch
MLP ( size_t input_size , size_t output_size );
// Now declare forward and backward function
nn :: matrix < float > forward ( const nn :: matrix < float > & input );
void backward ( const nn :: matrix < float > & grad );
};
// MLP.cpp
MLP :: MLP ( size_t input_size, size_t output_size)
{
/* Now add layer, using index to know identify each layer type */
add < 0 , nn :: Linear >(input_size, 500 );
add < 1 , nn :: ReLU >();
add < 2 , nn :: Linear >( 500 , 300 );
add < 3 , nn :: ReLU >();
add < 4 , nn :: Linear >( 300 , 100 );
add < 5 , nn :: ReLU >();
add < 6 , nn :: Linear >( 100 , output_size);
}
Then I use a abstract Layer
for all layers (and even activations):
class Layer
{
public:
virtual matrix < T > forward ( const matrix < T > & input ) = 0 ;
virtual matrix < T > backward ( const matrix < T > & grad ) = 0 ;
virtual void print_stats () {}
virtual void zero_grad () {}
virtual void accept_optimizer ( Optimizer * optim ) {}
virtual ~Layer () {};
protected:
matrix < T > m_input;
};
Here is an implementation of layer for Linear
layer:
Linear :: Linear ( size_t input_size, size_t output_size, bool rand_init, bool bias)
: m_b (output_size, 1 )
, m_W (output_size, input_size)
, is_bias (bias)
{
if (rand_init) {
// https://cs231n.github.io/neural-networks-2/#init
m_W = he_initialize (output_size, input_size);
:: print_stats (m_W, "weight_initialization" );
}
}
matrix < float > Linear :: forward ( const matrix < float > & input )
{
m_input = input;
if (is_bias)
return m_W * m_input + m_b;
else
return m_W * m_input;
}
matrix < float > Linear :: backward ( const matrix < float > & grad )
{
m_dx = m_W. t () * grad;
m_dW = grad * m_input. t ();
// We only update bias when we need it (bias = True)
// Else just keep it 0
if (is_bias) {
m_db = grad;
}
return m_dx;
}
Now the coolest thing is in Optimizer
, we want optimizer to access each layer and update weight for that layer but each layer has different set of weights (or biases) so I decide to use Visitor
pattern (learnt this in Crafting Interpreters ). An Optimizer
will be a visitor, then it will visit each layer, accept the layer and update the layer based on layer type.
void SGD :: visit_linear ( Linear * linear )
{
auto W = linear-> get_weight ();
auto dW = linear-> get_weightgrad ();
linear-> set_weight (W - (dW % m_lr));
if (linear->is_bias) {
auto b = linear-> get_bias ();
auto db = linear-> get_biasgrad ();
linear-> set_bias (b - (db % m_lr));
}
}
void Linear :: accept_optimizer ( Optimizer * optim )
{
optim-> visit_linear ( this );
}
void Optimizer :: step ()
{
// each layer will have a pointer to the model it's trying to optimize
for ( const auto& layer : m_model->layers) {
layer.second-> accept_optimizer ( this );
}
}
2. Some optimization
The first optimization is matrix multiplication (or you can call GEMM ), I use BLAS
from Apple Accelerate
framework (this optimization is cray as you can see in the result):
template < typename T >
inline matrix < T > operator * ( const matrix < T > & left , const matrix < T > & right )
{
assert (left.cols == right.rows);
size_t rows = left.rows;
size_t cols = right.cols;
size_t inners = right.rows; // can be left.cols
matrix < T > res (rows, cols);
#ifdef __APPLE__
/* Use Apple Accelerate framework to optimize */
__LAPACK_int blas_rows = (__LAPACK_int)rows;
__LAPACK_int blas_cols = (__LAPACK_int)cols;
__LAPACK_int blas_inners = (__LAPACK_int)inners;
// NOTE: I have to handle for multiple data type
// sgemm only works for float data type
cblas_sgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans,
blas_rows, blas_cols,
blas_inners, 1.0 , left.data. data (),
blas_inners, right.data. data (), blas_cols,
0.0 , res.data. data (), blas_cols);
#else
/* Fall back to naive solution ... */
#endif
return res;
}
Next using some OpenMP
operation to improve some other functions such as:
T reduce_sum () const
{
T total {};
#pragma omp parallel for reduction (+ : total )
for ( size_t i = 0 ; i < rows * cols; i ++ ) {
total += data[i];
}
return total;
}
// generate random matrix based on normal distribution
static matrix < T > nrand ( size_t rows , size_t cols , T mu , T std )
{
matrix < T > res (rows, cols);
#pragma omp parallel
{
std ::random_device rand_dev;
std ::mt19937 generator ( rand_dev () + omp_get_thread_num ());
std ::normal_distribution < T > distr (mu, std);
#pragma omp for
for ( size_t i = 0 ; i < rows * cols; i ++ ) {
res.data[i] = distr (generator);
}
}
return res;
}
3. Implement Convolution Layer
This is the hardest part because we want to express convolution operation as a matrix multiplication (we want to utilize BLAS
for the best performance) but how we can do that ? We will use two operations called im2col
and col2im
, these functions will trasform an image to matrix then matrix to image. Below is the implement from Darknet (I just copy from them):
// Taken from this paper: https://arxiv.org/abs/1603.07285
std :: pair < size_t , size_t > Conv2D :: calculate_output_size ( size_t input_h ,
size_t input_w ,
ConvParams params )
{
size_t output_h =
(input_h + 2 * params.pad_h - params.ker_h) / params.stride_h + 1 ;
size_t output_w =
(input_w + 2 * params.pad_w - params.ker_w) / params.stride_w + 1 ;
return std :: make_pair (output_h, output_w);
}
matrix < float > Conv2D :: im2col ( const cv :: Mat & data_im ,
size_t kernel_h ,
size_t kernel_w ,
size_t stride_h ,
size_t stride_w ,
size_t pad_h ,
size_t pad_w )
{
size_t height_col, width_col;
std :: tie (height_col, width_col) = Conv2D :: calculate_output_size (
data_im.rows,
data_im.cols,
{kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w});
size_t channels_col = data_im. channels () * kernel_h * kernel_w;
matrix <float> res (channels_col, width_col * height_col);
for ( size_t c = 0 ; c < channels_col; ++ c) {
size_t w_offset = c % kernel_w;
size_t h_offset = (c / kernel_w) % kernel_h;
size_t c_im = c / kernel_h / kernel_w;
for ( size_t h = 0 ; h < height_col; ++ h) {
for ( size_t w = 0 ; w < width_col; ++ w) {
size_t h_im = h * stride_h + h_offset;
size_t w_im = w * stride_w + w_offset;
size_t input_idx = (c * height_col + h) * width_col + w;
res.data[input_idx] =
Conv2D :: get_pixel_im2col (data_im, h_im, w_im, pad_h, pad_w, c_im);
}
}
}
return res;
}
float Conv2D :: get_pixel_im2col ( const cv :: Mat & data_im ,
size_t h_im ,
size_t w_im ,
size_t pad_h ,
size_t pad_w ,
size_t channel )
{
// H padding and W padding can be negative
int h_pad = static_cast<int> (h_im) - static_cast<int> (pad_h);
int w_pad = static_cast<int> (w_im) - static_cast<int> (pad_w);
if (h_pad >= 0 && h_pad < data_im.rows && w_pad >= 0 && w_pad < data_im.cols)
{
if (data_im. channels () == 1 ) {
return data_im.at <float> (h_pad, w_pad);
} else {
return data_im.ptr <float> (h_pad, w_pad)[channel];
}
}
return 0. f ;
}
After you translate the image to column for convolution, just do a simple forward pass as below:
cv :: Mat Conv2D :: forward ( const cv :: Mat & input )
{
// store for backward pass
m_im = input. clone ();
// turn image to column matrix
m_input = Conv2D :: im2col (m_im,
m_params.ker_h,
m_params.ker_w,
m_params.stride_h,
m_params.stride_w,
m_params.pad_h,
m_params.pad_w);
// Calculate convolution operation as matrix multiplication
matrix <float> output = m_W * m_input;
// Calculate output size
size_t output_c = output.rows;
size_t output_h, output_w;
std :: tie (output_h, output_w) =
Conv2D :: calculate_output_size (input.rows, input.cols, m_params);
// Then reshape column back to image
return Conv2D :: reshape_mat2im (output, output_c, output_h, output_w);
}
cv :: Mat Conv2D :: reshape_mat2im ( const matrix < float > & im ,
size_t channels ,
size_t height ,
size_t width )
{
cv ::Mat output (height, width, CV_32FC (channels));
for ( size_t c = 0 ; c < channels; c ++ ) {
for ( size_t h = 0 ; h < height; h ++ ) {
for ( size_t w = 0 ; w < width; w ++ ) {
size_t idx = (c * height + h) * width + w;
if (channels == 1 ) {
output.at <float> (h, w) = im.data[idx];
} else {
output.ptr <float> (h, w)[c] = im.data[idx];
}
}
}
}
return output;
}
Really cool, just as simple as that. But to implement it correctly with backward pass and pooling, we have to be stronger 💀 (actually I implemented backward and pooling layer but it's so long and so hard to talk here, hope we can meet again). Before go to the end, below is the implementation of LeNet5:
LeNet5 :: LeNet5 ( size_t input_channels, size_t output_size)
{
add < 0 , nn :: Conv2D >(input_channels,
/*output_channels=*/ 6 ,
/*stride=*/ 1 ,
/*kernel_size=*/ 5 ,
/*padding=*/ 0 );
add < 1 , nn :: ReLU >();
add < 2 , nn :: AvgPool2D >( /*kernel_size=*/ 2 ,
/*stride=*/ 2 ,
/*padding=*/ 0 );
add < 3 , nn :: Conv2D >( /*input_channels=*/ 6 ,
/*output_channels=*/ 16 ,
/*stride=*/ 1 ,
/*kernel_size=*/ 5 ,
/*padding=*/ 0 );
add < 4 , nn :: ReLU >();
add < 5 , nn :: AvgPool2D >( /*kernel_size=*/ 2 ,
/*stride=*/ 2 ,
/*padding=*/ 0 );
add < 6 , nn :: Flatten >();
add < 7 , nn :: Linear >( 400 , 84 );
add < 8 , nn :: ReLU >();
add < 9 , nn :: Dropout >(); // avoid overfitting
add < 10 , nn :: Linear >( 84 , output_size);
}
nn :: matrix < float > LeNet5 :: forward ( const cv :: Mat & image )
{
// Feature extraction
cv ::Mat im = layers. at ( 0 )-> forward (image); // Conv2D
im = layers. at ( 1 )-> forward (im); // ReLU
im = layers. at ( 2 )-> forward (im); // AvgPool2D
im = layers. at ( 3 )-> forward (im); // Conv2D
im = layers. at ( 4 )-> forward (im); // ReLU
im = layers. at ( 5 )-> forward (im); // AvgPool2D
// Flatten an image to 1D vector
nn ::matrix <float> result =
dynamic_cast< nn ::Flatten *> (layers. at ( 6 ). get ())-> forward_im (im);
// Classifier
result = layers. at ( 7 )-> forward (result); // Linear
result = layers. at ( 8 )-> forward (result); // ReLU
result = layers. at ( 9 )-> forward (result); // Dropout
result = layers. at ( 10 )-> forward (result); // Linear
return result;
}
void LeNet5 :: backward ( const nn :: matrix < float > & grad )
{
nn ::matrix <float> result = layers. at ( 10 )-> backward (grad); // Linear
result = layers. at ( 9 )-> backward (result); // Dropout
result = layers. at ( 8 )-> backward (result); // ReLU
result = layers. at ( 7 )-> backward (result); // Linear
// backward 1D vector to image
cv ::Mat im =
dynamic_cast< nn ::Flatten *> (layers. at ( 6 ). get ())-> backward_im (result);
im = layers. at ( 5 )-> backward (im); // AvgPool2D
im = layers. at ( 4 )-> backward (im); // ReLU
im = layers. at ( 3 )-> backward (im); // Conv2D
im = layers. at ( 2 )-> backward (im); // AvgPool2D
im = layers. at ( 1 )-> backward (im); // ReLU
im = layers. at ( 0 )-> backward (im); // Conv2D
}
Finally I can finish a side project (not just drop it after few lines of code) ðŸ˜.