diff --git a/include/conv2d.hpp b/include/conv2d.hpp
new file mode 100644
index 0000000..27baab0
--- /dev/null
+++ b/include/conv2d.hpp
@@ -0,0 +1,133 @@
+#ifndef CONV2D_HPP
+#define CONV2D_HPP
+
+#include "tensor_load.hpp"
+#include "layers.hpp"
+#include <iostream>
+
+class Conv2D : public Layer {
+public:
+    Conv2D(size_t in_channels, size_t out_channels, std::vector<size_t> kernel_size, size_t stride = 1, size_t padding = 0)
+        : in_channels(in_channels), out_channels(out_channels), kernel_size(kernel_size),
+          stride(stride), padding(padding) {
+        initialize();
+    }
+
+    void initialize() {
+        // Initialize weights and bias with random values
+        params.weights = xt::random::randn<double>({out_channels, in_channels, kernel_size[0], kernel_size[1]});
+        params.bias = xt::random::randn<double>({out_channels});
+    }
+
+    Tensor forward(Tensor inputs) override {
+        // Input is expected in NCHW layout: batch x channels x height x width
+        size_t batch_size = inputs.shape(0);
+        size_t input_channels = inputs.shape(1);
+        size_t input_height = inputs.shape(2);
+        size_t input_width = inputs.shape(3);
+
+        std::cout << "Input shape: " << batch_size << "x" << input_channels << "x" << input_height << "x" << input_width << std::endl;
+
+        // A stride of zero would cause a division by zero below
+        if (stride == 0) {
+            throw std::runtime_error("Stride cannot be zero");
+        }
+
+        // Standard convolution output size: (H - K + 2P) / S + 1
+        int output_height = (static_cast<int>(input_height) - static_cast<int>(kernel_size[0]) + 2 * static_cast<int>(padding)) / static_cast<int>(stride) + 1;
+        int output_width = (static_cast<int>(input_width) - static_cast<int>(kernel_size[1]) + 2 * static_cast<int>(padding)) / static_cast<int>(stride) + 1;
+
+        std::cout << "Calculated output height: " << output_height << std::endl;
+        std::cout << "Calculated output width: " << output_width << std::endl;
+
+        // Output dimensions must be positive for a valid convolution
+        if (output_height <= 0 || output_width <= 0) {
+            throw std::runtime_error("Invalid output dimensions");
+        }
+
+        Tensor outputs;
+        try {
+            outputs = xt::zeros<double>({batch_size,
+                                         out_channels,
+                                         static_cast<size_t>(output_height),
+                                         static_cast<size_t>(output_width)});
+            std::cout << "Output tensor created successfully" << std::endl;
+        } catch (const std::exception& e) {
+            std::cerr << "Error creating output tensor: " << e.what() << std::endl;
+            throw;
+        }
+
+        // Direct (naive) convolution over batch, output channel, output position, input channel and kernel position
+        for (size_t b = 0; b < batch_size; ++b) {
+            for (size_t oc = 0; oc < out_channels; ++oc) {
+                for (size_t oh = 0; oh < static_cast<size_t>(output_height); ++oh) {
+                    for (size_t ow = 0; ow < static_cast<size_t>(output_width); ++ow) {
+                        double sum = 0.0;
+                        for (size_t ic = 0; ic < in_channels; ++ic) {
+                            for (size_t kh = 0; kh < kernel_size[0]; ++kh) {
+                                for (size_t kw = 0; kw < kernel_size[1]; ++kw) {
+                                    // Positions inside the padding wrap around to huge unsigned values and fail the bounds check below
+                                    size_t ih = oh * stride + kh - padding;
+                                    size_t iw = ow * stride + kw - padding;
+                                    if (ih < input_height && iw < input_width) {
+                                        sum += inputs(b, ic, ih, iw) * params.weights(oc, ic, kh, kw);
+                                    }
+                                }
+                            }
+                        }
+                        outputs(b, oc, oh, ow) = sum + params.bias(oc);
+                    }
+                }
+            }
+        }
+
+        return outputs;
+    }
+
+    Tensor backward(Tensor grad, Tensor inputs) override {
+        size_t batch_size = inputs.shape(0);
+        size_t input_height = inputs.shape(2);
+        size_t input_width = inputs.shape(3);
+
+        size_t output_height = grad.shape(2);
+        size_t output_width = grad.shape(3);
+
+        Tensor input_grad = xt::zeros<double>(inputs.shape());
+        params.grad_weights = xt::zeros<double>(params.weights.shape());
+        params.grad_biases = xt::sum(grad, {0, 2, 3});
+
+        std::cout << "debug 1st backward conv2d" << std::endl;
+        // Accumulate gradients with respect to the input and the weights
+        for (size_t b = 0; b < batch_size; ++b) {
+            for (size_t oc = 0; oc < out_channels; ++oc) {
+                for (size_t oh = 0; oh < output_height; ++oh) {
+                    for (size_t ow = 0; ow < output_width; ++ow) {
+                        for (size_t ic = 0; ic < in_channels; ++ic) {
+                            for (size_t kh = 0; kh < kernel_size[0]; ++kh) {
+                                for (size_t kw = 0; kw < kernel_size[1]; ++kw) {
+                                    // Same unsigned wrap-around bounds check as in forward()
+                                    size_t ih = oh * stride + kh - padding;
+                                    size_t iw = ow * stride + kw - padding;
+                                    if (ih < input_height && iw < input_width) {
+                                        double grad_val = grad(b, oc, oh, ow);
+                                        input_grad(b, ic, ih, iw) += grad_val * params.weights(oc, ic, kh, kw);
+                                        params.grad_weights(oc, ic, kh, kw) += grad_val * inputs(b, ic, ih, iw);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return input_grad;
+    }
+
+private:
+    size_t in_channels;
+    size_t out_channels;
+    std::vector<size_t> kernel_size;
+    size_t stride;
+    size_t padding;
+};
+
+#endif
\ No newline at end of file
diff --git a/include/initialization.hpp b/include/initialization.hpp
index 1313053..29044b9 100644
--- a/include/initialization.hpp
+++ b/include/initialization.hpp
@@ -3,7 +3,12 @@
 
 #include "tensor_load.hpp"
 
-class Glorot{
+class INIT{
+    public:
+        virtual Tensor initialize(double n_rows, double n_cols){ return Tensor(); }
+};
+
+class Glorot : public INIT{
     public:
         Tensor initialize(double n_rows,double n_cols){
             /*Golrot proposed a method to initialize
@@ -18,7 +23,7 @@ class Glorot{
         }
 };
 
-class He{
+class He : public INIT{
     Tensor initialize(double n_rows, double n_cols){
         double bound = std::sqrt(6/n_rows);
         Tensor rand_values = xt::random::rand({n_rows,n_cols});
@@ -28,7 +33,7 @@ class He{
     }
 };
 
-class LSUV{
+class LSUV : public INIT{
     public:
        double input_stddev;
       LSUV(double scale){
diff --git a/include/neuralnetwork.hpp b/include/neuralnetwork.hpp
index a34a589..4261b61 100644
--- a/include/neuralnetwork.hpp
+++ b/include/neuralnetwork.hpp
@@ -4,6 +4,7 @@
 
 #include "tensor_load.hpp"
 #include "layers.hpp"
+#include "conv2d.hpp"
 
 #include 
 
diff --git a/include/train.hpp b/include/train.hpp
index a5e1d5b..5eda344 100644
--- a/include/train.hpp
+++ b/include/train.hpp
@@ -42,6 +42,7 @@ class Train{
             std::vector batches = batchit.initialize(inputs, targets);
             for (size_t i = 0; i < batches.size(); i++) {
                 Tensor predicted = net.forward(batches[i].inputs);
+                std::cout << "train 2" << std::endl;
                 epoch_loss += mse.loss(predicted, batches[i].targets);
                 Tensor grad = mse.grad(predicted, batches[i].targets);
                 net.backward(grad,batches[i].inputs);
diff --git a/src/main.cpp b/src/main.cpp
index 65662ca..3bed1ba 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -17,11 +17,13 @@ int main(int argc, char* argv[])
     auto linr = std::make_unique<Linear>(3,3);
     auto linr2 = std::make_unique<Linear>(3,3);
     auto linr3 = std::make_unique<Linear>(3,3);
+    auto conv2d = std::make_unique<Conv2D>(3, 3, std::vector<size_t>{1, 1});
     auto tanh_obj = std::make_unique<Tanh>();
 
     std::vector<std::unique_ptr<Layer>> layers;
-    layers.push_back(std::move(linr));
-    layers.push_back(std::move(linr2));
+    // layers.push_back(std::move(linr));
+    // layers.push_back(std::move(linr2));
+    layers.push_back(std::move(conv2d));
     layers.push_back(std::move(linr3));
     layers.push_back(std::move(tanh_obj));
 
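For reference, a minimal usage sketch of the new layer (not part of the patch) could look like the snippet below. It assumes that Tensor is an xt::xarray<double>-style alias provided by tensor_load.hpp and that inputs follow the NCHW layout used by forward(); the file name example_conv2d.cpp is only illustrative.

// example_conv2d.cpp -- hypothetical usage sketch, not part of the patch
#include "conv2d.hpp"
#include <iostream>

int main() {
    // One 3-channel 8x8 image in NCHW layout: batch x channels x height x width
    Tensor input = xt::random::rand<double>({1, 3, 8, 8});

    // 3 input channels, 4 output channels, 3x3 kernel, stride 1, no padding
    Conv2D conv(3, 4, std::vector<size_t>{3, 3});

    Tensor output = conv.forward(input);
    // Expected shape: 1 x 4 x 6 x 6, since (8 - 3 + 2*0) / 1 + 1 = 6
    std::cout << output.shape(0) << "x" << output.shape(1) << "x"
              << output.shape(2) << "x" << output.shape(3) << std::endl;
    return 0;
}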