Simple Neural Network in C++

Source code: https://github.com/lenguyen1807/mnist-cpp/tree/apple. Finally I can do something big 😭, this simple Neural Network (library or framework I don't know) can train LeNet5 (and hopefully AlexNet too).

NOTE: I just rewrite the whole source code (again), the old one looks really bad 💀 (update in 12-Jan-25). So this project took me 4 months 🥲.

TL;DR

How much improvement I get ?

The old version which took about 37 minutes to complete.

This is where I add BLAS optimization, got about 12 minutes

Got my first macbook so I need to rewirte my project for Apple. This is where I improve the code structure and add more layers (Convolution, Drop out) and M2 CPU is so crazy, it took only 2 minutes

Some details

1. Code structures

I want to mimic how Pytorch training (and testing) loop works so I used the same approach, I don't know it is the correct way or not 🥲 but it looks like this:

for (const auto& img : trainset.dataset) {
  // forward pass
  auto logits = model.forward(img->data);
 
  // calculate loss
  float loss = loss_fn(logits, img->label);
  train_loss += loss;
 
  // calculate accuracy
  int pred_label = loss_fn.get_pred().arg_max();
  int true_label = img->label.arg_max();
  train_correct += (pred_label == true_label) ? 1.0f : 0.0f;
 
  // backward pass
  model.backward(loss_fn.get_loss_grad());
 
  // update weight
  optim.step();
 
  // zero all gradients for next iteration
  model.zero_grad();
}

Also trying to simulate a custom class for model in Pytorch:

// MLP.h
class MLP : public nn::Module
{
public:
  // same for __init__ in pytorch
  MLP(size_t input_size, size_t output_size);
 
  // Now declare forward and backward function
  nn::matrix<float> forward(const nn::matrix<float>& input);
  void backward(const nn::matrix<float>& grad);
};
 
// MLP.cpp
MLP::MLP(size_t input_size, size_t output_size)
{
  /* Now add layer, using index to know identify each layer type */
  add<0, nn::Linear>(input_size, 500);
  add<1, nn::ReLU>();
  add<2, nn::Linear>(500, 300);
  add<3, nn::ReLU>();
  add<4, nn::Linear>(300, 100);
  add<5, nn::ReLU>();
  add<6, nn::Linear>(100, output_size);
}

Then I use a abstract Layer for all layers (and even activations):

class Layer
{
public:
  virtual matrix<T> forward(const matrix<T>& input) = 0;
  virtual matrix<T> backward(const matrix<T>& grad) = 0;
  virtual void print_stats() {}
  virtual void zero_grad() {}
  virtual void accept_optimizer(Optimizer* optim) {}
  virtual ~Layer() {};
 
protected:
  matrix<T> m_input;
};

Here is an implementation of layer for Linear layer:

Linear::Linear(size_t input_size, size_t output_size, bool rand_init, bool bias)
    : m_b(output_size, 1)
    , m_W(output_size, input_size)
    , is_bias(bias)
{
  if (rand_init) {
    // https://cs231n.github.io/neural-networks-2/#init
    m_W = he_initialize(output_size, input_size);
    ::print_stats(m_W, "weight_initialization");
  }
}
 
matrix<float> Linear::forward(const matrix<float>& input)
{
  m_input = input;
  if (is_bias)
    return m_W * m_input + m_b;
  else
    return m_W * m_input;
}
 
matrix<float> Linear::backward(const matrix<float>& grad)
{
  m_dx = m_W.t() * grad;
  m_dW = grad * m_input.t();
 
  // We only update bias when we need it (bias = True)
  // Else just keep it 0
  if (is_bias) {
    m_db = grad;
  }
 
  return m_dx;
}

Now the coolest thing is in Optimizer, we want optimizer to access each layer and update weight for that layer but each layer has different set of weights (or biases) so I decide to use Visitor pattern (learnt this in Crafting Interpreters). An Optimizer will be a visitor, then it will visit each layer, accept the layer and update the layer based on layer type.

void SGD::visit_linear(Linear* linear)
{
  auto W = linear->get_weight();
  auto dW = linear->get_weightgrad();
  linear->set_weight(W - (dW % m_lr));
 
  if (linear->is_bias) {
    auto b = linear->get_bias();
    auto db = linear->get_biasgrad();
    linear->set_bias(b - (db % m_lr));
  }
}
 
void Linear::accept_optimizer(Optimizer* optim)
{
  optim->visit_linear(this);
}
 
void Optimizer::step()
{
  // each layer will have a pointer to the model it's trying to optimize
  for (const auto& layer : m_model->layers) {
    layer.second->accept_optimizer(this);
  }
}

2. Some optimization

The first optimization is matrix multiplication (or you can call GEMM), I use BLAS from Apple Accelerate framework (this optimization is cray as you can see in the result):

template<typename T>
inline matrix<T> operator*(const matrix<T>& left, const matrix<T>& right)
{
  assert(left.cols == right.rows);
 
  size_t rows = left.rows;
  size_t cols = right.cols;
  size_t inners = right.rows;  // can be left.cols
 
  matrix<T> res(rows, cols);
 
#ifdef __APPLE__
  /* Use Apple Accelerate framework to optimize */
  __LAPACK_int blas_rows = (__LAPACK_int)rows;
  __LAPACK_int blas_cols = (__LAPACK_int)cols;
  __LAPACK_int blas_inners = (__LAPACK_int)inners;
 
  // NOTE: I have to handle for multiple data type
  // sgemm only works for float data type
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              blas_rows, blas_cols, 
              blas_inners, 1.0, left.data.data(), 
              blas_inners, right.data.data(), blas_cols, 
              0.0, res.data.data(), blas_cols);
#else
  /* Fall back to naive solution ... */
#endif
 
  return res;
}

Next using some OpenMP operation to improve some other functions such as:

T reduce_sum() const
{
  T total {};
  #pragma omp parallel for reduction(+ : total)
  for (size_t i = 0; i < rows * cols; i++) {
    total += data[i];
  }
  return total;
}
 
// generate random matrix based on normal distribution
static matrix<T> nrand(size_t rows, size_t cols, T mu, T std)
{
  matrix<T> res(rows, cols);
  #pragma omp parallel
  {
    std::random_device rand_dev;
    std::mt19937 generator(rand_dev() + omp_get_thread_num());
    std::normal_distribution<T> distr(mu, std);
 
    #pragma omp for
    for (size_t i = 0; i < rows * cols; i++) {
      res.data[i] = distr(generator);
    }
  }
  return res;
}

3. Implement Convolution Layer

This is the hardest part because we want to express convolution operation as a matrix multiplication (we want to utilize BLAS for the best performance) but how we can do that ? We will use two operations called im2col and col2im, these functions will trasform an image to matrix then matrix to image. Below is the implement from Darknet (I just copy from them):

// Taken from this paper: https://arxiv.org/abs/1603.07285
std::pair<size_t, size_t> Conv2D::calculate_output_size(size_t input_h,
                                                        size_t input_w,
                                                        ConvParams params)
{
  size_t output_h =
      (input_h + 2 * params.pad_h - params.ker_h) / params.stride_h + 1;
  size_t output_w =
      (input_w + 2 * params.pad_w - params.ker_w) / params.stride_w + 1;
  return std::make_pair(output_h, output_w);
}
 
matrix<float> Conv2D::im2col(const cv::Mat& data_im,
                             size_t kernel_h,
                             size_t kernel_w,
                             size_t stride_h,
                             size_t stride_w,
                             size_t pad_h,
                             size_t pad_w)
{
  size_t height_col, width_col;
  std::tie(height_col, width_col) = Conv2D::calculate_output_size(
      data_im.rows,
      data_im.cols,
      {kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w});
  size_t channels_col = data_im.channels() * kernel_h * kernel_w;
 
  matrix<float> res(channels_col, width_col * height_col);
 
  for (size_t c = 0; c < channels_col; ++c) {
    size_t w_offset = c % kernel_w;
    size_t h_offset = (c / kernel_w) % kernel_h;
    size_t c_im = c / kernel_h / kernel_w;
    for (size_t h = 0; h < height_col; ++h) {
      for (size_t w = 0; w < width_col; ++w) {
        size_t h_im = h * stride_h + h_offset;
        size_t w_im = w * stride_w + w_offset;
        size_t input_idx = (c * height_col + h) * width_col + w;
        res.data[input_idx] =
            Conv2D::get_pixel_im2col(data_im, h_im, w_im, pad_h, pad_w, c_im);
      }
    }
  }
 
  return res;
}
 
float Conv2D::get_pixel_im2col(const cv::Mat& data_im,
                               size_t h_im,
                               size_t w_im,
                               size_t pad_h,
                               size_t pad_w,
                               size_t channel)
{
  // H padding and W padding can be negative
  int h_pad = static_cast<int>(h_im) - static_cast<int>(pad_h);
  int w_pad = static_cast<int>(w_im) - static_cast<int>(pad_w);
 
  if (h_pad >= 0 && h_pad < data_im.rows && w_pad >= 0 && w_pad < data_im.cols)
  {
    if (data_im.channels() == 1) {
      return data_im.at<float>(h_pad, w_pad);
    } else {
      return data_im.ptr<float>(h_pad, w_pad)[channel];
    }
  }
 
  return 0.f;
}

After you translate the image to column for convolution, just do a simple forward pass as below:

cv::Mat Conv2D::forward(const cv::Mat& input)
{
  // store for backward pass
  m_im = input.clone();
 
  // turn image to column matrix
  m_input = Conv2D::im2col(m_im,
                           m_params.ker_h,
                           m_params.ker_w,
                           m_params.stride_h,
                           m_params.stride_w,
                           m_params.pad_h,
                           m_params.pad_w);
 
  // Calculate convolution operation as matrix multiplication
  matrix<float> output = m_W * m_input;
 
  // Calculate output size
  size_t output_c = output.rows;
  size_t output_h, output_w;
  std::tie(output_h, output_w) =
      Conv2D::calculate_output_size(input.rows, input.cols, m_params);
 
  // Then reshape column back to image
  return Conv2D::reshape_mat2im(output, output_c, output_h, output_w);
}
 
cv::Mat Conv2D::reshape_mat2im(const matrix<float>& im,
                               size_t channels,
                               size_t height,
                               size_t width)
{
  cv::Mat output(height, width, CV_32FC(channels));
  for (size_t c = 0; c < channels; c++) {
    for (size_t h = 0; h < height; h++) {
      for (size_t w = 0; w < width; w++) {
        size_t idx = (c * height + h) * width + w;
        if (channels == 1) {
          output.at<float>(h, w) = im.data[idx];
        } else {
          output.ptr<float>(h, w)[c] = im.data[idx];
        }
      }
    }
  }
 
  return output;
}

Really cool, just as simple as that. But to implement it correctly with backward pass and pooling, we have to be stronger 💀 (actually I implemented backward and pooling layer but it's so long and so hard to talk here, hope we can meet again). Before go to the end, below is the implementation of LeNet5:

LeNet5::LeNet5(size_t input_channels, size_t output_size)
{
  add<0, nn::Conv2D>(input_channels,
                     /*output_channels=*/6,
                     /*stride=*/1,
                     /*kernel_size=*/5,
                     /*padding=*/0);
  add<1, nn::ReLU>();
  add<2, nn::AvgPool2D>(/*kernel_size=*/2,
                        /*stride=*/2,
                        /*padding=*/0);
  add<3, nn::Conv2D>(/*input_channels=*/6,
                     /*output_channels=*/16,
                     /*stride=*/1,
                     /*kernel_size=*/5,
                     /*padding=*/0);
  add<4, nn::ReLU>();
  add<5, nn::AvgPool2D>(/*kernel_size=*/2,
                        /*stride=*/2,
                        /*padding=*/0);
  add<6, nn::Flatten>();
  add<7, nn::Linear>(400, 84);
  add<8, nn::ReLU>();
  add<9, nn::Dropout>();  // avoid overfitting
  add<10, nn::Linear>(84, output_size);
}
 
nn::matrix<float> LeNet5::forward(const cv::Mat& image)
{
  // Feature extraction
  cv::Mat im = layers.at(0)->forward(image);  // Conv2D
  im = layers.at(1)->forward(im);  // ReLU
  im = layers.at(2)->forward(im);  // AvgPool2D
  im = layers.at(3)->forward(im);  // Conv2D
  im = layers.at(4)->forward(im);  // ReLU
  im = layers.at(5)->forward(im);  // AvgPool2D
 
  // Flatten an image to 1D vector
  nn::matrix<float> result =
      dynamic_cast<nn::Flatten*>(layers.at(6).get())->forward_im(im);
 
  // Classifier
  result = layers.at(7)->forward(result);  // Linear
  result = layers.at(8)->forward(result);  // ReLU
  result = layers.at(9)->forward(result);  // Dropout
  result = layers.at(10)->forward(result);  // Linear
  return result;
}
 
void LeNet5::backward(const nn::matrix<float>& grad)
{
  nn::matrix<float> result = layers.at(10)->backward(grad);  // Linear
  result = layers.at(9)->backward(result);  // Dropout
  result = layers.at(8)->backward(result);  // ReLU
  result = layers.at(7)->backward(result);  // Linear
 
  // backward 1D vector to image
  cv::Mat im =
      dynamic_cast<nn::Flatten*>(layers.at(6).get())->backward_im(result);
 
  im = layers.at(5)->backward(im);  // AvgPool2D
  im = layers.at(4)->backward(im);  // ReLU
  im = layers.at(3)->backward(im);  // Conv2D
  im = layers.at(2)->backward(im);  // AvgPool2D
  im = layers.at(1)->backward(im);  // ReLU
  im = layers.at(0)->backward(im);  // Conv2D
}

Finally I can finish a side project (not just drop it after few lines of code) 😭.