diff --git a/src/stan/io/opencl/deserializer.hpp b/src/stan/io/opencl/deserializer.hpp new file mode 100644 index 00000000000..ad3ebd62d5f --- /dev/null +++ b/src/stan/io/opencl/deserializer.hpp @@ -0,0 +1,692 @@ +#ifndef STAN_IO_OPENCL_DESERIALIZER_HPP +#define STAN_IO_OPENCL_DESERIALIZER_HPP + +#ifdef STAN_OPENCL + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace stan { +namespace io { + +/** + * OpenCL deserializer specialization for matrix_cl buffers (prim). + */ +template <> +class deserializer> { + using mat_t = stan::math::matrix_cl; + private: + const mat_t& data_r_; + Eigen::Map> map_i_; + size_t r_size_{0}; + size_t i_size_{0}; + size_t pos_r_{0}; + size_t pos_i_{0}; + size_t align_elems_{1}; + + /** + * Check there are at least m reals left to read. + * + * @param m Number of real elements to read. + * @throws std::runtime_error if there are insufficient elements. + */ + void check_r_capacity(size_t m) const { + STAN_NO_RANGE_CHECKS_RETURN; + if (pos_r_ + m > r_size_) { + []() STAN_COLD_PATH { + throw std::runtime_error("no more scalars to read"); + }(); + } + } + + /** + * Check there are at least m integers left to read. + * + * @param m Number of integer elements to read. + * @throws std::runtime_error if there are insufficient elements. + */ + void check_i_capacity(size_t m) const { + STAN_NO_RANGE_CHECKS_RETURN; + if (pos_i_ + m > i_size_) { + []() STAN_COLD_PATH { + throw std::runtime_error("no more integers to read"); + }(); + } + } + + /** + * Align the real position to the next aligned element offset. + */ + inline void align_pos() { pos_r_ = internal::round_up(pos_r_, align_elems_); } + + /** + * Read a block as a matrix_cl subbuffer. + * + * @param size Block size in elements. + * @param rows Number of rows. + * @param cols Number of cols. + * @return matrix_cl wrapping the subbuffer. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + inline mat_t read_matrix_cl_(size_t size, int rows, + int cols) { + align_pos(); + if (size == 0) { + return mat_t(rows, cols); + } + check_r_capacity(size); + const size_t origin_bytes = pos_r_ * sizeof(double); + const size_t size_bytes = size * sizeof(double); + cl_buffer_region region{origin_bytes, size_bytes}; + cl::Buffer parent = data_r_.buffer(); + cl::Buffer sub = parent.createSubBuffer( + CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion); + pos_r_ += size; + align_pos(); + return mat_t(sub, rows, cols); + } + + public: + /** + * Construct a deserializer over host buffers. + * + * @tparam RVec real data vector type. + * @tparam IntVec integer data vector type. + * @param data_r Real data. + * @param data_i Integer data. + */ + template * = nullptr> + deserializer(const RVec& data_r, const IntVec& data_i) + : data_r_(data_r), + map_i_(data_i.data(), data_i.size()), + r_size_(data_r.size()), + i_size_(data_i.size()) {} + + /** + * Construct a deserializer over an OpenCL buffer with alignment. + * + * @tparam IntVec integer data vector type. + * @param data_r Device buffer of reals. + * @param data_i Integer data. + * @param align_elems Alignment in elements. + */ + template * = nullptr> + deserializer(const mat_t& data_r, const IntVec& data_i, + size_t align_elems) + : data_r_(data_r), + map_i_(data_i.data(), data_i.size()), + r_size_(data_r.size()), + i_size_(data_i.size()), + align_elems_(std::max(1, align_elems)) {} + + /** + * @return Number of remaining real elements. + */ + inline size_t available() const noexcept { return r_size_ - pos_r_; } + /** + * @return Number of remaining integer elements. + */ + inline size_t available_i() const noexcept { return i_size_ - pos_i_; } + + /** + * Read a scalar floating-point value. + * + * @tparam Ret Floating point type. + * @return Scalar value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template >* = nullptr> + inline Ret read() { + auto cl_val = read_matrix_cl_(1, 1, 1); + return stan::math::from_matrix_cl(cl_val); + } + + /** + * Read a complex scalar (two consecutive reals). + * + * @tparam Ret Complex type. + * @return Complex value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template * = nullptr> + inline Ret read() { + auto real = this->read(); + auto imag = this->read(); + return std::complex{real, imag}; + } + + /** + * Read an integer value. + * + * @tparam Ret Integral type. + * @return Integer value. + * @throws std::runtime_error if there are insufficient elements. + */ + template * = nullptr> + inline Ret read() { + check_i_capacity(1); + return map_i_.coeffRef(pos_i_++); + } + + /** + * Read a matrix_cl with the given dimensions. + * + * @tparam Ret matrix_cl type. + * @param rows Rows. + * @param cols Cols. + * @return matrix_cl view of the subbuffer. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template * = nullptr> + inline Ret read(Eigen::Index rows, Eigen::Index cols) { + return read_matrix_cl_(static_cast(rows * cols), + static_cast(rows), static_cast(cols)); + } + + /** + * Read a vector (matrix_cl with cols=1). + * + * @tparam Ret matrix_cl type. + * @param m Length. + * @return matrix_cl view of the subbuffer. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template * = nullptr> + inline Ret read(Eigen::Index m) { + return read_matrix_cl_(static_cast(m), static_cast(m), 1); + } + + /** + * Read a std::vector of elements. + * + * @tparam Ret std::vector type. + * @param m Vector length. + * @param dims Dimensions for each element. + * @return std::vector of deserialized elements. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template * = nullptr> + inline auto read(Eigen::Index m, Sizes... dims) { + std::decay_t ret_vec; + if (unlikely(m == 0)) { + return ret_vec; + } + ret_vec.reserve(m); + for (size_t i = 0; i < static_cast(m); ++i) { + ret_vec.emplace_back(this->read>(dims...)); + } + return ret_vec; + } + + /** + * Read with lower-bound constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam LB Lower bound type. + * @tparam LP Log probability accumulator type. + * @param lb Lower bound. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_lb(const LB& lb, LP& lp, Sizes... sizes) { + return stan::math::lb_constrain(this->read(sizes...), lb, lp); + } + + /** + * Read with upper-bound constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam UB Upper bound type. + * @tparam LP Log probability accumulator type. + * @param ub Upper bound. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_ub(const UB& ub, LP& lp, Sizes... sizes) { + return stan::math::ub_constrain(this->read(sizes...), ub, lp); + } + + /** + * Read with lower/upper-bound constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam LB Lower bound type. + * @tparam UB Upper bound type. + * @tparam LP Log probability accumulator type. + * @param lb Lower bound. + * @param ub Upper bound. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_lub(const LB& lb, const UB& ub, LP& lp, + Sizes... sizes) { + return stan::math::lub_constrain(this->read(sizes...), lb, ub, + lp); + } + + /** + * Read with offset-multiplier constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam M Offset type. + * @tparam S Multiplier type. + * @tparam LP Log probability accumulator type. + * @param mu Offset. + * @param sigma Multiplier. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_offset_multiplier(const M& mu, const S& sigma, + LP& lp, Sizes... sizes) { + return stan::math::offset_multiplier_constrain( + this->read(sizes...), mu, sigma, lp); + } + + /** + * Read with unit-vector constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam LP Log probability accumulator type. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_unit_vector(LP& lp, Sizes... sizes) { + return stan::math::unit_vector_constrain(this->read(sizes...), + lp); + } +}; + +/** + * OpenCL deserializer specialization for var_value buffers (rev). + */ +template <> +class deserializer>> { + using mat_t = stan::math::matrix_cl; + private: + std::reference_wrapper val_; + std::reference_wrapper adj_; + Eigen::Map> map_i_; + size_t r_size_{0}; + size_t i_size_{0}; + size_t pos_r_{0}; + size_t pos_i_{0}; + size_t align_elems_{1}; + + /** + * Check there are at least m reals left to read. + * + * @param m Number of real elements to read. + * @throws std::runtime_error if there are insufficient elements. + */ + void check_r_capacity(size_t m) const { + STAN_NO_RANGE_CHECKS_RETURN; + if (pos_r_ + m > r_size_) { + []() STAN_COLD_PATH { + throw std::runtime_error("no more scalars to read"); + }(); + } + } + + /** + * Check there are at least m integers left to read. + * + * @param m Number of integer elements to read. + * @throws std::runtime_error if there are insufficient elements. + */ + void check_i_capacity(size_t m) const { + STAN_NO_RANGE_CHECKS_RETURN; + if (pos_i_ + m > i_size_) { + []() STAN_COLD_PATH { + throw std::runtime_error("no more integers to read"); + }(); + } + } + + /** + * Align the real position to the next aligned element offset. + */ + inline void align_pos() { pos_r_ = internal::round_up(pos_r_, align_elems_); } + + /** + * Read a block as a var_value subbuffer. + * + * @param size Block size in elements. + * @param rows Number of rows. + * @param cols Number of cols. + * @return var_value wrapping value and adjoint subbuffers. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + inline stan::math::var_value + read_var_matrix_cl_(size_t size, int rows, int cols) { + align_pos(); + if (size == 0) { + mat_t empty_val(rows, cols); + mat_t empty_adj(rows, cols); + auto* vi = new stan::math::vari_value( + std::move(empty_val), std::move(empty_adj)); + return stan::math::var_value(vi); + } + check_r_capacity(size); + const size_t origin_bytes = pos_r_ * sizeof(double); + const size_t size_bytes = size * sizeof(double); + cl_buffer_region region{origin_bytes, size_bytes}; + cl::Buffer sub_val = val_.get().buffer().createSubBuffer( + CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion); + cl::Buffer sub_adj = adj_.get().buffer().createSubBuffer( + CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion); + pos_r_ += size; + align_pos(); + mat_t val_mat(std::move(sub_val), rows, cols); + mat_t adj_mat(std::move(sub_adj), rows, cols); + auto* vi = new stan::math::vari_value( + std::move(val_mat), std::move(adj_mat)); + return stan::math::var_value(vi); + } + + public: + /** + * Construct a deserializer over a var_value buffer. + * + * @tparam IntVec integer data vector type. + * @param data_r Device buffer with values and adjoints. + * @param data_i Integer data. + * @param align_elems Alignment in elements. + */ + template * = nullptr> + deserializer(const stan::math::var_value& data_r, + const IntVec& data_i, size_t align_elems) + : val_(data_r.val()), + adj_(data_r.adj()), + map_i_(data_i.data(), data_i.size()), + r_size_(data_r.val().size()), + i_size_(data_i.size()), + align_elems_(std::max(1, align_elems)) {} + + /** + * @return Number of remaining real elements. + */ + inline size_t available() const noexcept { return r_size_ - pos_r_; } + /** + * @return Number of remaining integer elements. + */ + inline size_t available_i() const noexcept { return i_size_ - pos_i_; } + + /** + * Read a scalar floating-point value. + * + * @tparam Ret Floating point type. + * @return Scalar value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template >* = nullptr> + inline Ret read() { + auto cl_val = read_var_matrix_cl_(1, 1, 1).val(); + return stan::math::from_matrix_cl(cl_val); + } + + /** + * Read a complex scalar (two consecutive reals). + * + * @tparam Ret Complex type. + * @return Complex value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template * = nullptr> + inline Ret read() { + auto real = this->read(); + auto imag = this->read(); + return std::complex{real, imag}; + } + + /** + * Read an integer value. + * + * @tparam Ret Integral type. + * @return Integer value. + * @throws std::runtime_error if there are insufficient elements. + */ + template * = nullptr> + inline Ret read() { + check_i_capacity(1); + return map_i_.coeffRef(pos_i_++); + } + + /** + * Read a var_value with the given dimensions. + * + * @tparam Ret var_value type. + * @param rows Rows. + * @param cols Cols. + * @return var_value view of the subbuffers. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template >>* = nullptr> + inline Ret read(Eigen::Index rows, Eigen::Index cols) { + return read_var_matrix_cl_(static_cast(rows * cols), + static_cast(rows), + static_cast(cols)); + } + + /** + * Read a vector as var_value (cols=1). + * + * @tparam Ret var_value type. + * @param m Length. + * @return var_value view of the subbuffers. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template >>* = nullptr> + inline Ret read(Eigen::Index m) { + return read_var_matrix_cl_(static_cast(m), static_cast(m), 1); + } + + /** + * Read a std::vector of elements. + * + * @tparam Ret std::vector type. + * @param m Vector length. + * @param dims Dimensions for each element. + * @return std::vector of deserialized elements. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template * = nullptr> + inline auto read(Eigen::Index m, Sizes... dims) { + std::decay_t ret_vec; + if (unlikely(m == 0)) { + return ret_vec; + } + ret_vec.reserve(m); + for (size_t i = 0; i < static_cast(m); ++i) { + ret_vec.emplace_back(this->read>(dims...)); + } + return ret_vec; + } + + /** + * Read with lower-bound constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam LB Lower bound type. + * @tparam LP Log probability accumulator type. + * @param lb Lower bound. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_lb(const LB& lb, LP& lp, Sizes... sizes) { + return stan::math::lb_constrain(this->read(sizes...), lb, lp); + } + + /** + * Read with upper-bound constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam UB Upper bound type. + * @tparam LP Log probability accumulator type. + * @param ub Upper bound. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_ub(const UB& ub, LP& lp, Sizes... sizes) { + return stan::math::ub_constrain(this->read(sizes...), ub, lp); + } + + /** + * Read with lower/upper-bound constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam LB Lower bound type. + * @tparam UB Upper bound type. + * @tparam LP Log probability accumulator type. + * @param lb Lower bound. + * @param ub Upper bound. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_lub(const LB& lb, const UB& ub, LP& lp, + Sizes... sizes) { + return stan::math::lub_constrain(this->read(sizes...), lb, ub, + lp); + } + + /** + * Read with offset-multiplier constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam M Offset type. + * @tparam S Multiplier type. + * @tparam LP Log probability accumulator type. + * @param mu Offset. + * @param sigma Multiplier. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_offset_multiplier(const M& mu, const S& sigma, + LP& lp, Sizes... sizes) { + return stan::math::offset_multiplier_constrain( + this->read(sizes...), mu, sigma, lp); + } + + /** + * Read with unit-vector constraint. + * + * @tparam Ret Return type. + * @tparam Jacobian Whether to include Jacobian. + * @tparam LP Log probability accumulator type. + * @param lp Log probability accumulator. + * @param sizes Dimensions for the read. + * @return Constrained value. + * @throws std::runtime_error if there are insufficient elements. + * @throws cl::Error if subbuffer creation fails. + */ + template + inline auto read_constrain_unit_vector(LP& lp, Sizes... sizes) { + return stan::math::unit_vector_constrain(this->read(sizes...), + lp); + } +}; + +} // namespace io +} // namespace stan + +#endif // STAN_OPENCL + +#endif // STAN_IO_OPENCL_DESERIALIZER_HPP diff --git a/src/stan/io/opencl/utils.hpp b/src/stan/io/opencl/utils.hpp new file mode 100644 index 00000000000..1b36f43988a --- /dev/null +++ b/src/stan/io/opencl/utils.hpp @@ -0,0 +1,231 @@ +#ifndef STAN_IO_OPENCL_UTILS_HPP +#define STAN_IO_OPENCL_UTILS_HPP + +#ifdef STAN_OPENCL + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace stan { +namespace io { + +/** + * Layout information for aligned OpenCL serializer buffers. + * + * Offsets and sizes are in elements (not bytes). + */ +struct serializer_layout { + /** Aligned offsets in elements for each block. */ + std::vector offsets; + /** Block sizes in elements. */ + std::vector sizes; + /** Total size (including padding) in elements. */ + size_t total_size{0}; + /** Alignment in elements. */ + size_t align_elems{1}; +}; + +namespace internal { +/** + * Round a value up to the next multiple. + * + * @param value Value to round up. + * @param multiple Alignment multiple. + * @return Rounded value (>= value). + */ +inline size_t round_up(size_t value, size_t multiple) { + if (multiple == 0) { + return value; + } + const size_t rem = value % multiple; + return rem == 0 ? value : value + (multiple - rem); +} + +/** + * Query the device alignment and return required alignment in elements. + * + * @return Alignment in elements for double buffers. + * @throws cl::Error if OpenCL device queries fail. + */ +inline size_t align_elems_from_device() { + size_t align_bits + = stan::math::opencl_context.device()[0] + .getInfo(); + size_t align_bytes = (align_bits + 7) / 8; + if (align_bytes == 0) { + return 1; + } + const size_t elem_size = sizeof(double); + const size_t g = std::gcd(align_bytes, elem_size); + const size_t align_elems = align_bytes / (g == 0 ? 1 : g); + return align_elems == 0 ? 1 : align_elems; +} +} // namespace internal + +/** + * Compute aligned offsets and total size for parameter blocks. + * + * @param sizes Block sizes in read order (elements). + * @param align_elems Alignment in elements. + * @return Layout with aligned offsets and total size. + */ +inline serializer_layout compute_serializer_layout( + const std::vector& sizes, size_t align_elems) { + serializer_layout layout; + layout.sizes = sizes; + layout.align_elems = std::max(1, align_elems); + layout.offsets.reserve(sizes.size()); + size_t pos = 0; + for (size_t size : sizes) { + pos = internal::round_up(pos, layout.align_elems); + layout.offsets.push_back(pos); + pos += size; + } + layout.total_size = pos; + return layout; +} + +/** + * Allocate a matrix_cl buffer sized to the serialized layout. + * + * @param layout Serializer layout. + * @param flags OpenCL memory flags. + * @return Device buffer with shape (layout.total_size, 1). + * @throws std::system_error if an OpenCL error occurs. + */ +inline stan::math::matrix_cl allocate_serializer_buffer( + const serializer_layout& layout, cl_mem_flags flags) { + if (layout.total_size == 0) { + return stan::math::matrix_cl(); + } + cl::Context& ctx = stan::math::opencl_context.context(); + try { + cl_mem_flags alloc_flags = flags; + if (stan::math::opencl_context.device()[0] + .getInfo()) { + alloc_flags |= CL_MEM_ALLOC_HOST_PTR; + } + cl::Buffer buffer(ctx, alloc_flags, sizeof(double) * layout.total_size); + return stan::math::matrix_cl(buffer, layout.total_size, 1); + } catch (const cl::Error& e) { + stan::math::check_opencl_error("allocate_serializer_buffer", e); + } + return stan::math::matrix_cl(); +} + +/** + * Copy host parameters into aligned blocks of a device buffer. + * + * @param src Flat parameters with no padding. + * @param dst Device buffer to receive padded blocks. + * @param layout Serializer layout describing offsets and sizes. + * @throws std::invalid_argument if src size does not match sum of sizes. + * @throws std::system_error if an OpenCL error occurs. + */ +inline void copy_to_serialize_buffer(const Eigen::VectorXd& src, + stan::math::matrix_cl& dst, + const serializer_layout& layout) { + const size_t total_src = static_cast(src.size()); + const size_t total_sizes + = std::accumulate(layout.sizes.begin(), layout.sizes.end(), size_t{0}); + stan::math::check_size_match("copy_to_serialize_buffer", "src.size()", + total_src, "sum(sizes)", total_sizes); + if (layout.total_size == 0) { + return; + } + + auto& queue = stan::math::opencl_context.queue(); + std::vector events; + events.reserve(layout.sizes.size()); + size_t src_offset = 0; + + try { + for (size_t i = 0; i < layout.sizes.size(); ++i) { + const size_t block_size = layout.sizes[i]; + if (block_size == 0) { + continue; + } + const size_t origin_bytes = layout.offsets[i] * sizeof(double); + const size_t size_bytes = block_size * sizeof(double); + cl_buffer_region region{origin_bytes, size_bytes}; + cl::Buffer sub_dst = dst.buffer().createSubBuffer( + CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion); + + cl::Event event; + queue.enqueueWriteBuffer(sub_dst, CL_FALSE, 0, size_bytes, + src.data() + src_offset, nullptr, &event); + events.push_back(event); + src_offset += block_size; + } + + for (cl::Event& e : events) { + e.wait(); + } + } catch (const cl::Error& e) { + stan::math::check_opencl_error("copy_to_serialize_buffer", e); + } +} + +/** + * Serialize host parameters into a padded OpenCL buffer. + * + * @param params Flat unconstrained parameters. + * @param dimss Parameter dimensions (unused, passed for API parity). + * @param sizes Unconstrained block sizes. + * @return var_value holding values and adjoints buffers. + * @throws std::system_error if an OpenCL error occurs. + * @throws std::invalid_argument if params size does not match sum of sizes. + */ +inline stan::math::var_value> serialize_to_opencl( + const Eigen::VectorXd& params, + const std::vector>& dimss, + const std::vector& sizes) { + (void)dimss; + const size_t align_elems = internal::align_elems_from_device(); + const serializer_layout layout = compute_serializer_layout(sizes, align_elems); + + stan::math::matrix_cl values + = allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + stan::math::matrix_cl adjoints + = allocate_serializer_buffer(layout, CL_MEM_READ_WRITE); + + if (layout.total_size > 0) { + auto& queue = stan::math::opencl_context.queue(); + try { + double zero = 0.0; + queue.enqueueFillBuffer(values.buffer(), zero, 0, + sizeof(double) * layout.total_size); + queue.enqueueFillBuffer(adjoints.buffer(), zero, 0, + sizeof(double) * layout.total_size); + queue.finish(); + } catch (const cl::Error& e) { + stan::math::check_opencl_error("serialize_to_opencl", e); + } + } + + copy_to_serialize_buffer(params, values, layout); + + auto* vi = new stan::math::vari_value>( + std::move(values), std::move(adjoints)); + return stan::math::var_value>(vi); +} + +} // namespace io +} // namespace stan + +#endif // STAN_OPENCL + +#endif // STAN_IO_OPENCL_UTILS_HPP diff --git a/src/stan/model/model_base.hpp b/src/stan/model/model_base.hpp index ad92415330d..d070ec4a728 100644 --- a/src/stan/model/model_base.hpp +++ b/src/stan/model/model_base.hpp @@ -6,9 +6,13 @@ #endif #include #include +#ifdef STAN_OPENCL +#include +#endif #include #include #include +#include #include #include #include @@ -203,6 +207,41 @@ class model_base : public prob_grad { virtual math::var log_prob(Eigen::Matrix& params_r, std::ostream* msgs) const = 0; +#ifdef STAN_OPENCL + /** + * Return the log density for the specified OpenCL unconstrained parameters, + * without Jacobian and with normalizing constants for probability functions. + * + * @param[in] params_r unconstrained parameters on the device + * @param[in,out] msgs message stream + * @return log density for specified parameters + */ + virtual math::var log_prob(math::matrix_cl& params_r, + std::ostream* msgs) const { + static_cast(params_r); + static_cast(msgs); + throw std::runtime_error( + "OpenCL log_prob not implemented for this model."); + } + + /** + * Return the log density for the specified OpenCL unconstrained parameters, + * without Jacobian and with normalizing constants for probability functions. + * + * @param[in] params_r unconstrained parameters on the device with adjoints + * @param[in,out] msgs message stream + * @return log density for specified parameters + */ + virtual math::var log_prob( + math::var_value>& params_r, + std::ostream* msgs) const { + static_cast(params_r); + static_cast(msgs); + throw std::runtime_error( + "OpenCL log_prob not implemented for this model."); + } +#endif + /** * Return the log density for the specified unconstrained * parameters, with Jacobian correction for constraints and with diff --git a/src/stan/model/model_base_crtp.hpp b/src/stan/model/model_base_crtp.hpp index cfb54a91ad5..c3cca5a1ed2 100644 --- a/src/stan/model/model_base_crtp.hpp +++ b/src/stan/model/model_base_crtp.hpp @@ -53,6 +53,19 @@ namespace model { * std::ostream* msgs = 0) const * ``` * + * When STAN_OPENCL is defined, the derived class may also implement + * the OpenCL overloads to enable OpenCL deserialization: + * + * ``` + * math::var log_prob(math::matrix_cl& params_r, + * std::ostream* msgs = 0) const; + * math::var log_prob(math::var_value>& params_r, + * std::ostream* msgs = 0) const; + * ``` + * + * If these overloads are not provided, calling the OpenCL log_prob overload + * will throw at runtime. + * *

The derived class `M` must be declared following the curiously * recursive template pattern, for example, if `M` is `foo_model`, * then `foo_model` should be declared as diff --git a/src/stan/model/model_header.hpp b/src/stan/model/model_header.hpp index a3896ecc6cb..a603a90eb6f 100644 --- a/src/stan/model/model_header.hpp +++ b/src/stan/model/model_header.hpp @@ -5,6 +5,12 @@ #include #include +#ifdef STAN_OPENCL +#include +#include +#include +#endif + #include #include diff --git a/src/test/unit/io/deserializer_opencl_stdvector_test.cpp b/src/test/unit/io/deserializer_opencl_stdvector_test.cpp new file mode 100644 index 00000000000..6634c4236cc --- /dev/null +++ b/src/test/unit/io/deserializer_opencl_stdvector_test.cpp @@ -0,0 +1,89 @@ +#ifdef STAN_OPENCL +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { +void append_sizes(std::vector& sizes, size_t count, size_t size) { + sizes.insert(sizes.end(), count, size); +} + +Eigen::VectorXd make_params(const std::vector& sizes) { + const size_t total = std::accumulate(sizes.begin(), sizes.end(), size_t{0}); + Eigen::VectorXd params(static_cast(total)); + for (Eigen::Index i = 0; i < params.size(); ++i) { + params.coeffRef(i) = static_cast(i + 1); + } + return params; +} +} // namespace + +TEST(deserializer_opencl_stdvector, read_varied_containers) { + std::vector theta_i; + + std::vector sizes; + append_sizes(sizes, 4, 1); // std::vector(4) + append_sizes(sizes, 4, 1); // std::vector>(2) + append_sizes(sizes, 2, 4); // std::vector(2, 2x2) + append_sizes(sizes, 6, 2); // std::vector>(2,3,1x2) + + Eigen::VectorXd params = make_params(sizes); + auto align_elems = stan::io::internal::align_elems_from_device(); + auto layout = stan::io::compute_serializer_layout(sizes, align_elems); + auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + stan::io::copy_to_serialize_buffer(params, values, layout); + + std::vector params_vec(params.data(), + params.data() + params.size()); + stan::io::deserializer cpu(params_vec, theta_i); + stan::io::deserializer> deserializer(values, + theta_i, + align_elems); + + auto scalars = deserializer.read>(4); + auto scalars_ref = cpu.read>(4); + ASSERT_EQ(scalars.size(), scalars_ref.size()); + for (size_t i = 0; i < scalars.size(); ++i) { + EXPECT_FLOAT_EQ(scalars_ref[i], scalars[i]); + } + + auto complex_vals = deserializer.read>>(2); + auto complex_ref = cpu.read>>(2); + ASSERT_EQ(complex_vals.size(), complex_ref.size()); + for (size_t i = 0; i < complex_vals.size(); ++i) { + EXPECT_FLOAT_EQ(complex_ref[i].real(), complex_vals[i].real()); + EXPECT_FLOAT_EQ(complex_ref[i].imag(), complex_vals[i].imag()); + } + + auto mats = deserializer.read>>( + 2, 2, 2); + auto mats_ref = cpu.read>(2, 2, 2); + ASSERT_EQ(mats.size(), mats_ref.size()); + for (size_t i = 0; i < mats.size(); ++i) { + Eigen::MatrixXd mat = stan::math::from_matrix_cl(mats[i]); + stan::test::expect_near_rel("deserializer_opencl", mat, mats_ref[i]); + } + + auto nested = deserializer.read< + std::vector>>>(2, 3, 1, 2); + auto nested_ref + = cpu.read>>(2, 3, 1, 2); + ASSERT_EQ(nested.size(), nested_ref.size()); + for (size_t i = 0; i < nested.size(); ++i) { + ASSERT_EQ(nested[i].size(), nested_ref[i].size()); + for (size_t j = 0; j < nested[i].size(); ++j) { + Eigen::MatrixXd mat = stan::math::from_matrix_cl(nested[i][j]); + stan::test::expect_near_rel("deserializer_opencl", mat, + nested_ref[i][j]); + } + } +} +#else +#include +TEST(deserializer_opencl_stdvector, dummy) { EXPECT_NO_THROW(); } +#endif diff --git a/src/test/unit/io/deserializer_opencl_test.cpp b/src/test/unit/io/deserializer_opencl_test.cpp new file mode 100644 index 00000000000..acd7efa9b07 --- /dev/null +++ b/src/test/unit/io/deserializer_opencl_test.cpp @@ -0,0 +1,210 @@ +#ifdef STAN_OPENCL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { +struct opencl_pack { + stan::math::matrix_cl values; + stan::io::serializer_layout layout; + size_t align_elems; +}; + +opencl_pack pack_opencl_values(const Eigen::VectorXd& params, + const std::vector& sizes) { + opencl_pack pack; + pack.align_elems = stan::io::internal::align_elems_from_device(); + pack.layout = stan::io::compute_serializer_layout(sizes, pack.align_elems); + pack.values = stan::io::allocate_serializer_buffer(pack.layout, + CL_MEM_READ_ONLY); + stan::io::copy_to_serialize_buffer(params, pack.values, pack.layout); + return pack; +} + +Eigen::VectorXd make_params(const std::vector& sizes) { + const size_t total = std::accumulate(sizes.begin(), sizes.end(), size_t{0}); + Eigen::VectorXd params(static_cast(total)); + for (Eigen::Index i = 0; i < params.size(); ++i) { + params.coeffRef(i) = static_cast(i + 1); + } + return params; +} +} // namespace + +TEST(deserializer_opencl_mixed, read_scalar_complex_vector_matrix) { + std::vector theta_i{7}; + std::vector sizes{1, 1, 1, 3, 4, 4}; + Eigen::VectorXd params = make_params(sizes); + auto pack = pack_opencl_values(params, sizes); + + std::vector params_vec(params.data(), + params.data() + params.size()); + stan::io::deserializer cpu(params_vec, theta_i); + stan::io::deserializer> deserializer( + pack.values, theta_i, pack.align_elems); + + double x = deserializer.read(); + EXPECT_FLOAT_EQ(cpu.read(), x); + + std::complex z = deserializer.read>(); + std::complex z_ref = cpu.read>(); + EXPECT_FLOAT_EQ(z_ref.real(), z.real()); + EXPECT_FLOAT_EQ(z_ref.imag(), z.imag()); + + auto vec_cl = deserializer.read>(3); + Eigen::VectorXd vec = stan::math::from_matrix_cl(vec_cl); + Eigen::VectorXd vec_ref = cpu.read(3); + stan::test::expect_near_rel("deserializer_opencl", vec, vec_ref); + + auto row_cl = deserializer.read>(1, 4); + Eigen::MatrixXd row = stan::math::from_matrix_cl(row_cl); + Eigen::RowVectorXd row_ref = cpu.read(4); + stan::test::expect_near_rel("deserializer_opencl", row, + Eigen::MatrixXd(row_ref)); + + auto mat_cl = deserializer.read>(2, 2); + Eigen::MatrixXd mat = stan::math::from_matrix_cl(mat_cl); + Eigen::MatrixXd mat_ref = cpu.read(2, 2); + stan::test::expect_near_rel("deserializer_opencl", mat, mat_ref); + + int i = deserializer.read(); + EXPECT_EQ(cpu.read(), i); +} + +TEST(deserializer_opencl_constraints, read_lb) { + std::vector theta_i; + std::vector sizes{3}; + Eigen::VectorXd params = make_params(sizes); + auto pack = pack_opencl_values(params, sizes); + + std::vector params_vec(params.data(), + params.data() + params.size()); + stan::io::deserializer cpu(params_vec, theta_i); + stan::io::deserializer> deserializer( + pack.values, theta_i, pack.align_elems); + + double lp = 0.0; + auto lb_cl + = deserializer.read_constrain_lb, true>( + -1.0, lp, 3); + double lp_ref = 0.0; + auto lb_ref = stan::math::lb_constrain(cpu.read(3), + -1.0, lp_ref); + Eigen::VectorXd lb_host + = stan::math::from_matrix_cl(lb_cl); + stan::test::expect_near_rel("deserializer_opencl", lb_host, lb_ref); + EXPECT_NEAR(lp_ref, lp, 1e-8); +} + +TEST(deserializer_opencl_constraints, read_ub) { + std::vector theta_i; + std::vector sizes{2}; + Eigen::VectorXd params = make_params(sizes); + auto pack = pack_opencl_values(params, sizes); + + std::vector params_vec(params.data(), + params.data() + params.size()); + stan::io::deserializer cpu(params_vec, theta_i); + stan::io::deserializer> deserializer( + pack.values, theta_i, pack.align_elems); + + double lp = 0.0; + auto ub_cl + = deserializer.read_constrain_ub, true>( + 2.0, lp, 2); + double lp_ref = 0.0; + auto ub_ref = stan::math::ub_constrain(cpu.read(2), + 2.0, lp_ref); + Eigen::VectorXd ub_host + = stan::math::from_matrix_cl(ub_cl); + stan::test::expect_near_rel("deserializer_opencl", ub_host, ub_ref); + EXPECT_NEAR(lp_ref, lp, 1e-8); +} + +TEST(deserializer_opencl_constraints, read_lub) { + std::vector theta_i; + std::vector sizes{4}; + Eigen::VectorXd params = make_params(sizes); + auto pack = pack_opencl_values(params, sizes); + + std::vector params_vec(params.data(), + params.data() + params.size()); + stan::io::deserializer cpu(params_vec, theta_i); + stan::io::deserializer> deserializer( + pack.values, theta_i, pack.align_elems); + + double lp = 0.0; + auto lub_cl + = deserializer.read_constrain_lub, true>( + -1.0, 1.0, lp, 4); + double lp_ref = 0.0; + auto lub_ref = stan::math::lub_constrain(cpu.read(4), + -1.0, 1.0, lp_ref); + Eigen::VectorXd lub_host + = stan::math::from_matrix_cl(lub_cl); + stan::test::expect_near_rel("deserializer_opencl", lub_host, lub_ref); + EXPECT_NEAR(lp_ref, lp, 1e-8); +} + +TEST(deserializer_opencl_constraints, read_offset_multiplier) { + std::vector theta_i; + std::vector sizes{3}; + Eigen::VectorXd params = make_params(sizes); + auto pack = pack_opencl_values(params, sizes); + + std::vector params_vec(params.data(), + params.data() + params.size()); + stan::io::deserializer cpu(params_vec, theta_i); + stan::io::deserializer> deserializer( + pack.values, theta_i, pack.align_elems); + + double lp = 0.0; + auto off_cl = deserializer + .read_constrain_offset_multiplier< + stan::math::matrix_cl, true>(1.5, 2.0, lp, 3); + double lp_ref = 0.0; + auto off_ref = stan::math::offset_multiplier_constrain( + cpu.read(3), 1.5, 2.0, lp_ref); + Eigen::VectorXd off_host + = stan::math::from_matrix_cl(off_cl); + stan::test::expect_near_rel("deserializer_opencl", off_host, off_ref); + EXPECT_NEAR(lp_ref, lp, 1e-8); +} + +TEST(deserializer_opencl_constraints, subbuffer_addition) { + std::vector theta_i; + std::vector sizes{4, 4}; + Eigen::VectorXd params = make_params(sizes); + auto pack = pack_opencl_values(params, sizes); + + std::vector params_vec(params.data(), + params.data() + params.size()); + stan::io::deserializer cpu(params_vec, theta_i); + stan::io::deserializer> deserializer( + pack.values, theta_i, pack.align_elems); + + auto a_cl = deserializer.read>(2, 2); + auto b_cl = deserializer.read>(2, 2); + stan::math::matrix_cl sum_cl = a_cl + b_cl; + + Eigen::MatrixXd a_ref = cpu.read(2, 2); + Eigen::MatrixXd b_ref = cpu.read(2, 2); + Eigen::MatrixXd sum_ref = a_ref + b_ref; + + Eigen::MatrixXd sum_host + = stan::math::from_matrix_cl(sum_cl); + stan::test::expect_near_rel("deserializer_opencl", sum_host, sum_ref); +} +#else +#include +TEST(deserializer_opencl_mixed, dummy) { EXPECT_NO_THROW(); } +TEST(deserializer_opencl_constraints, dummy) { EXPECT_NO_THROW(); } +#endif diff --git a/src/test/unit/io/deserializer_opencl_varmat_test.cpp b/src/test/unit/io/deserializer_opencl_varmat_test.cpp new file mode 100644 index 00000000000..11f268673b1 --- /dev/null +++ b/src/test/unit/io/deserializer_opencl_varmat_test.cpp @@ -0,0 +1,89 @@ +#ifdef STAN_OPENCL +#include +#include +#include +#include +#include + +TEST(deserializer_opencl_varmat, read_and_adj) { + std::vector theta_i; + Eigen::VectorXd params(6); + params << 1, 2, 3, 4, 5, 6; + + std::vector sizes{6}; + auto align_elems = stan::io::internal::align_elems_from_device(); + auto layout = stan::io::compute_serializer_layout(sizes, align_elems); + auto var_buf = stan::io::serialize_to_opencl(params, {}, sizes); + + stan::io::deserializer>> + deserializer(var_buf, theta_i, align_elems); + auto mat_var = deserializer.read>>(3, 2); + + Eigen::MatrixXd vals = stan::math::from_matrix_cl(mat_var.val()); + EXPECT_EQ(vals.rows(), 3); + EXPECT_EQ(vals.cols(), 2); + EXPECT_FLOAT_EQ(vals(0, 0), 1.0); + EXPECT_FLOAT_EQ(vals(1, 0), 2.0); + EXPECT_FLOAT_EQ(vals(2, 0), 3.0); + EXPECT_FLOAT_EQ(vals(0, 1), 4.0); + EXPECT_FLOAT_EQ(vals(1, 1), 5.0); + EXPECT_FLOAT_EQ(vals(2, 1), 6.0); + + mat_var.adj() = stan::math::constant(1.0, 3, 2); + mat_var.adj().wait_for_write_events(); + + Eigen::VectorXd full_adj + = stan::math::from_matrix_cl(var_buf.adj()); + ASSERT_EQ(full_adj.size(), static_cast(layout.total_size)); + for (int i = 0; i < 6; ++i) { + EXPECT_FLOAT_EQ(full_adj[i], 1.0); + } +} + +TEST(deserializer_opencl_varmat, multiple_blocks_and_padding) { + std::vector theta_i; + std::vector sizes{3, 5}; + auto align_elems = stan::io::internal::align_elems_from_device(); + auto layout = stan::io::compute_serializer_layout(sizes, align_elems); + + Eigen::VectorXd params(static_cast(sizes[0] + sizes[1])); + for (Eigen::Index i = 0; i < params.size(); ++i) { + params.coeffRef(i) = static_cast(i + 1); + } + + auto var_buf = stan::io::serialize_to_opencl(params, {}, sizes); + stan::io::deserializer>> + deserializer(var_buf, theta_i, align_elems); + + auto vec_var = deserializer.read>>(3); + auto row_var = deserializer.read>>(1, 5); + + vec_var.adj() = stan::math::constant(1.0, 3, 1); + row_var.adj() = stan::math::constant(2.0, 1, 5); + vec_var.adj().wait_for_write_events(); + row_var.adj().wait_for_write_events(); + + Eigen::VectorXd full_adj + = stan::math::from_matrix_cl(var_buf.adj()); + ASSERT_EQ(full_adj.size(), static_cast(layout.total_size)); + + std::vector expected(layout.total_size, 0.0); + for (size_t i = 0; i < sizes.size(); ++i) { + const size_t block_size = sizes[i]; + const size_t offset = layout.offsets[i]; + const double value = (i == 0) ? 1.0 : 2.0; + for (size_t j = 0; j < block_size; ++j) { + expected[offset + j] = value; + } + } + + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_FLOAT_EQ(expected[i], full_adj[static_cast(i)]); + } +} +#else +#include +TEST(deserializer_opencl_varmat, dummy) { EXPECT_NO_THROW(); } +#endif diff --git a/src/test/unit/io/opencl_subbuffer_ops_test.cpp b/src/test/unit/io/opencl_subbuffer_ops_test.cpp new file mode 100644 index 00000000000..1737cea4ce7 --- /dev/null +++ b/src/test/unit/io/opencl_subbuffer_ops_test.cpp @@ -0,0 +1,54 @@ +#ifdef STAN_OPENCL +#include +#include +#include +#include +#include + +TEST(opencl_subbuffer_ops, add_subbuffers) { + std::vector sizes{4, 4}; + const size_t align_elems = stan::io::internal::align_elems_from_device(); + const auto layout = stan::io::compute_serializer_layout(sizes, align_elems); + + Eigen::VectorXd params(8); + for (Eigen::Index i = 0; i < params.size(); ++i) { + params.coeffRef(i) = static_cast(i + 1); + } + + auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + stan::io::copy_to_serialize_buffer(params, values, layout); + + cl::Buffer parent = values.buffer(); + cl_buffer_region region_a{layout.offsets[0] * sizeof(double), + sizes[0] * sizeof(double)}; + cl_buffer_region region_b{layout.offsets[1] * sizeof(double), + sizes[1] * sizeof(double)}; + + cl::Buffer sub_a = parent.createSubBuffer(CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion_a); + cl::Buffer sub_b = parent.createSubBuffer(CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion_b); + + stan::math::matrix_cl a(sub_a, 2, 2); + stan::math::matrix_cl b(sub_b, 2, 2); + + stan::math::matrix_cl sum = a + b; + Eigen::MatrixXd sum_host = stan::math::from_matrix_cl(sum); + + Eigen::Map> + a_ref(params.data(), 2, 2); + Eigen::Map> + b_ref(params.data() + 4, 2, 2); + + Eigen::MatrixXd expected = a_ref + b_ref; + stan::test::expect_near_rel("opencl_subbuffer_ops", sum_host, expected); +} + +#else +#include +TEST(opencl_subbuffer_ops, dummy) { EXPECT_NO_THROW(); } +#endif diff --git a/src/test/unit/io/serializer_opencl_layout_test.cpp b/src/test/unit/io/serializer_opencl_layout_test.cpp new file mode 100644 index 00000000000..79a1ef236cc --- /dev/null +++ b/src/test/unit/io/serializer_opencl_layout_test.cpp @@ -0,0 +1,181 @@ +#ifdef STAN_OPENCL +#include +#include +#include +#include +#include +#include +#include + +namespace { +Eigen::VectorXd make_params(const std::vector& sizes) { + const size_t total = std::accumulate(sizes.begin(), sizes.end(), size_t{0}); + Eigen::VectorXd params(static_cast(total)); + for (Eigen::Index i = 0; i < params.size(); ++i) { + params.coeffRef(i) = static_cast(i + 1); + } + return params; +} + +void expect_layout_matches(const Eigen::VectorXd& full, + const stan::io::serializer_layout& layout, + const Eigen::VectorXd& params) { + std::vector is_data(layout.total_size, 0); + size_t src_offset = 0; + for (size_t i = 0; i < layout.sizes.size(); ++i) { + const size_t offset = layout.offsets[i]; + const size_t block_size = layout.sizes[i]; + for (size_t j = 0; j < block_size; ++j) { + EXPECT_FLOAT_EQ(full[offset + j], params[src_offset + j]); + is_data[offset + j] = 1; + } + src_offset += block_size; + } + for (size_t i = 0; i < layout.total_size; ++i) { + if (!is_data[i]) { + EXPECT_FLOAT_EQ(full[static_cast(i)], 0.0); + } + } +} +} // namespace + +TEST(serializer_opencl_layout, compute_layout) { + std::vector sizes{1, 7, 3}; + auto layout = stan::io::compute_serializer_layout(sizes, 4); + ASSERT_EQ(layout.offsets.size(), sizes.size()); + EXPECT_EQ(layout.offsets[0], 0U); + EXPECT_EQ(layout.offsets[1], 4U); + EXPECT_EQ(layout.offsets[2], 12U); + EXPECT_EQ(layout.total_size, 15U); +} + +TEST(serializer_opencl_layout, compute_layout_no_padding) { + std::vector sizes{2, 3, 5}; + auto layout = stan::io::compute_serializer_layout(sizes, 1); + ASSERT_EQ(layout.offsets.size(), sizes.size()); + EXPECT_EQ(layout.offsets[0], 0U); + EXPECT_EQ(layout.offsets[1], 2U); + EXPECT_EQ(layout.offsets[2], 5U); + EXPECT_EQ(layout.total_size, 10U); +} + +TEST(serializer_opencl_layout, allocate_empty_buffer) { + std::vector sizes; + auto layout = stan::io::compute_serializer_layout(sizes, 4); + auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + EXPECT_EQ(values.size(), 0); + EXPECT_EQ(values.rows(), 0); + EXPECT_EQ(values.cols(), 0); +} + +TEST(serializer_opencl_layout, allocate_buffer_shape) { + std::vector sizes{3}; + auto layout = stan::io::compute_serializer_layout(sizes, 4); + auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + EXPECT_EQ(values.rows(), static_cast(layout.total_size)); + EXPECT_EQ(values.cols(), 1); +} + +TEST(serializer_opencl_layout, copy_to_buffer) { + std::vector sizes{2, 3}; + auto layout = stan::io::compute_serializer_layout(sizes, 4); + stan::math::matrix_cl values + = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + + if (layout.total_size > 0) { + auto& queue = stan::math::opencl_context.queue(); + double zero = 0.0; + queue.enqueueFillBuffer(values.buffer(), zero, 0, + sizeof(double) * layout.total_size); + queue.finish(); + } + + Eigen::VectorXd params(5); + params << 1.0, 2.0, 3.0, 4.0, 5.0; + stan::io::copy_to_serialize_buffer(params, values, layout); + + Eigen::VectorXd full = stan::math::from_matrix_cl(values); + ASSERT_EQ(full.size(), 7); + EXPECT_FLOAT_EQ(full[0], 1.0); + EXPECT_FLOAT_EQ(full[1], 2.0); + EXPECT_FLOAT_EQ(full[2], 0.0); + EXPECT_FLOAT_EQ(full[3], 0.0); + EXPECT_FLOAT_EQ(full[4], 3.0); + EXPECT_FLOAT_EQ(full[5], 4.0); + EXPECT_FLOAT_EQ(full[6], 5.0); +} + +TEST(serializer_opencl_layout, copy_to_buffer_many_blocks) { + std::vector sizes{1, 6, 1, 4, 8}; + auto layout = stan::io::compute_serializer_layout(sizes, 4); + auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + + if (layout.total_size > 0) { + auto& queue = stan::math::opencl_context.queue(); + double zero = 0.0; + queue.enqueueFillBuffer(values.buffer(), zero, 0, + sizeof(double) * layout.total_size); + queue.finish(); + } + + Eigen::VectorXd params = make_params(sizes); + stan::io::copy_to_serialize_buffer(params, values, layout); + + Eigen::VectorXd full = stan::math::from_matrix_cl(values); + ASSERT_EQ(full.size(), static_cast(layout.total_size)); + expect_layout_matches(full, layout, params); +} + +TEST(serializer_opencl_layout, serialize_to_opencl_roundtrip) { + std::vector sizes{1, 2, 3, 4, 4, 12}; + Eigen::VectorXd params = make_params(sizes); + std::vector> dimss; + auto params_opencl = stan::io::serialize_to_opencl(params, dimss, sizes); + + auto align_elems = stan::io::internal::align_elems_from_device(); + auto layout = stan::io::compute_serializer_layout(sizes, align_elems); + Eigen::VectorXd full_vals + = stan::math::from_matrix_cl(params_opencl.val()); + ASSERT_EQ(full_vals.size(), static_cast(layout.total_size)); + expect_layout_matches(full_vals, layout, params); + + EXPECT_EQ(params_opencl.adj().rows(), + static_cast(layout.total_size)); + EXPECT_EQ(params_opencl.adj().cols(), 1); + stan::math::recover_memory(); +} + +TEST(serializer_opencl_layout, stdvector_dimensions_match_cpu_shapes) { + std::vector sizes{ + 10, 10, // std::vector(2, 10) + 20, 20, // std::vector(2, 10) -> 2 scalars per element + 10, 10, // std::vector(2, 10) + 20, 20, // std::vector(2, 10) + 16, 16, // std::vector(2, 4x4) + 32, 32, // std::vector(2, 4x4) + 10, 10, // std::vector>(2, 10) + 20, 20 // std::vector>(2, 10) + }; + + auto align_elems = stan::io::internal::align_elems_from_device(); + auto layout = stan::io::compute_serializer_layout(sizes, align_elems); + auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY); + + if (layout.total_size > 0) { + auto& queue = stan::math::opencl_context.queue(); + double zero = 0.0; + queue.enqueueFillBuffer(values.buffer(), zero, 0, + sizeof(double) * layout.total_size); + queue.finish(); + } + + Eigen::VectorXd params = make_params(sizes); + stan::io::copy_to_serialize_buffer(params, values, layout); + Eigen::VectorXd full = stan::math::from_matrix_cl(values); + ASSERT_EQ(full.size(), static_cast(layout.total_size)); + expect_layout_matches(full, layout, params); +} +#else +#include +TEST(serializer_opencl_layout, dummy) { EXPECT_NO_THROW(); } +#endif diff --git a/src/test/unit/model/model_base_crtp_test.cpp b/src/test/unit/model/model_base_crtp_test.cpp index 6db95fa208c..5b2a7977cde 100644 --- a/src/test/unit/model/model_base_crtp_test.cpp +++ b/src/test/unit/model/model_base_crtp_test.cpp @@ -1,6 +1,9 @@ #include #include #include +#ifdef STAN_OPENCL +#include +#endif #include #include #include @@ -57,6 +60,19 @@ struct mock_model : public stan::model::model_base_crtp { } } +#ifdef STAN_OPENCL + stan::math::var log_prob(stan::math::matrix_cl& params_r, + std::ostream* msgs) const override { + return 9; + } + + stan::math::var log_prob( + stan::math::var_value>& params_r, + std::ostream* msgs) const override { + return 10; + } +#endif + void transform_inits(const stan::io::var_context& context, Eigen::VectorXd& params_r, std::ostream* msgs) const override {} diff --git a/src/test/unit/model/model_base_test.cpp b/src/test/unit/model/model_base_test.cpp index c4a81f71323..2dc2af11c0e 100644 --- a/src/test/unit/model/model_base_test.cpp +++ b/src/test/unit/model/model_base_test.cpp @@ -1,5 +1,8 @@ #include #include +#ifdef STAN_OPENCL +#include +#endif #include #include #include @@ -42,6 +45,19 @@ struct mock_model : public stan::model::model_base { return 2; } +#ifdef STAN_OPENCL + stan::math::var log_prob(stan::math::matrix_cl& params_r, + std::ostream* msgs) const override { + return 9; + } + + stan::math::var log_prob( + stan::math::var_value>& params_r, + std::ostream* msgs) const override { + return 10; + } +#endif + double log_prob_jacobian(Eigen::VectorXd& params_r, std::ostream* msgs) const override { return 3; diff --git a/src/test/unit/model/opencl_log_prob_test.cpp b/src/test/unit/model/opencl_log_prob_test.cpp new file mode 100644 index 00000000000..c1c6e25e210 --- /dev/null +++ b/src/test/unit/model/opencl_log_prob_test.cpp @@ -0,0 +1,169 @@ +#ifdef STAN_OPENCL + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +class opencl_mock_model + : public stan::model::model_base_crtp { + public: + using var_matrix_cl_t + = stan::math::var_value>; + + opencl_mock_model(size_t a_size, size_t b_size) + : model_base_crtp(a_size + b_size), + a_size_(a_size), + b_size_(b_size) {} + + std::string model_name() const override { return "opencl_mock_model"; } + + std::vector model_compile_info() const override { return {}; } + + void get_param_names(std::vector& names, bool include_tparams, + bool include_gqs) const override { + names.clear(); + names.emplace_back("a"); + names.emplace_back("b"); + } + + void get_dims(std::vector>& dimss, bool include_tparams, + bool include_gqs) const override { + dimss.clear(); + dimss.emplace_back(std::vector{a_size_}); + dimss.emplace_back(std::vector{b_size_}); + } + + void constrained_param_names(std::vector& param_names, + bool include_tparams, + bool include_gqs) const override { + param_names.clear(); + } + + void unconstrained_param_names(std::vector& param_names, + bool include_tparams, + bool include_gqs) const override { + param_names.clear(); + } + + template + T log_prob(Eigen::Matrix& params_r, std::ostream* msgs) const { + std::vector params_i; + stan::io::deserializer in(params_r, params_i); + using vec_t = Eigen::Matrix; + vec_t a = in.template read(static_cast(a_size_)); + vec_t b = in.template read(static_cast(b_size_)); + return stan::math::dot_product(a, a) + stan::math::dot_product(b, b); + } + + template + T log_prob(std::vector& params_r, std::vector& params_i, + std::ostream* msgs) const { + return 0; + } + + stan::math::var log_prob(stan::math::matrix_cl& params_r, + std::ostream* msgs) const override { + std::vector params_i; + size_t align_elems = stan::io::internal::align_elems_from_device(); + stan::io::deserializer> in( + params_r, params_i, align_elems); + auto a = in.template read>( + static_cast(a_size_)); + auto b = in.template read>( + static_cast(b_size_)); + double lp = stan::math::dot_product(a, a) + stan::math::dot_product(b, b); + return stan::math::var(lp); + } + + stan::math::var log_prob( + stan::math::var_value>& params_r, + std::ostream* msgs) const override { + std::vector params_i; + size_t align_elems = stan::io::internal::align_elems_from_device(); + stan::io::deserializer in(params_r, params_i, align_elems); + auto a = in.template read( + static_cast(a_size_)); + auto b = in.template read( + static_cast(b_size_)); + return stan::math::dot_product(a, a) + stan::math::dot_product(b, b); + } + + void transform_inits(const stan::io::var_context& context, + Eigen::VectorXd& params_r, + std::ostream* msgs) const override {} + + template + void write_array(RNG& base_rng, Eigen::VectorXd& params_r, + Eigen::VectorXd& params_constrained_r, bool include_tparams, + bool include_gqs, std::ostream* msgs) const {} + + void unconstrain_array(const Eigen::VectorXd& params_constrained_r, + Eigen::VectorXd& params_r, + std::ostream* msgs = nullptr) const override {} + + void transform_inits(const stan::io::var_context& context, + std::vector& params_i, + std::vector& params_r, + std::ostream* msgs) const override {} + + template + void write_array(RNG& base_rng, std::vector& params_r, + std::vector& params_i, + std::vector& params_r_constrained, + bool include_tparams, bool include_gqs, + std::ostream* msgs) const {} + + void unconstrain_array(const std::vector& params_constrained_r, + std::vector& params_r, + std::ostream* msgs = nullptr) const override {} + + private: + size_t a_size_; + size_t b_size_; +}; + +} // namespace + +TEST(model, openclLogProbMatchesCpu) { + size_t align_elems = stan::io::internal::align_elems_from_device(); + size_t a_size = align_elems > 1 ? align_elems - 1 : 2; + size_t b_size = 3; + opencl_mock_model model(a_size, b_size); + + Eigen::VectorXd params(static_cast(a_size + b_size)); + for (Eigen::Index i = 0; i < params.size(); ++i) { + params.coeffRef(i) = static_cast(i + 1); + } + + double expected = params.squaredNorm(); + + std::vector> dimss; + std::vector sizes{a_size, b_size}; + auto params_opencl = stan::io::serialize_to_opencl(params, dimss, sizes); + + auto lp_opencl = model.log_prob(params_opencl, nullptr); + EXPECT_NEAR(expected, lp_opencl.val(), 1e-12); + + stan::math::matrix_cl params_vals = params_opencl.val(); + auto lp_opencl_prim = model.log_prob(params_vals, nullptr); + EXPECT_NEAR(expected, lp_opencl_prim.val(), 1e-12); + + stan::math::recover_memory(); +} + +#else + +#include +TEST(model, openclLogProbDummy) { EXPECT_NO_THROW(); } + +#endif diff --git a/src/test/unit/services/util/mcmc_writer_test.cpp b/src/test/unit/services/util/mcmc_writer_test.cpp index c9840a68806..92afc6bae81 100644 --- a/src/test/unit/services/util/mcmc_writer_test.cpp +++ b/src/test/unit/services/util/mcmc_writer_test.cpp @@ -7,6 +7,9 @@ #include #include #include +#ifdef STAN_OPENCL +#include +#endif namespace test { // mock_throwing_model_in_write_array throws exception in the write_array() @@ -39,6 +42,20 @@ class throwing_model : public stan::model::model_base_crtp { return 0.0; } // log_prob() +#ifdef STAN_OPENCL + inline stan::math::var log_prob(stan::math::matrix_cl& params_r, + std::ostream* pstream__ = nullptr) const + override { + return 0.0; + } + + inline stan::math::var log_prob( + stan::math::var_value>& params_r, + std::ostream* pstream__ = nullptr) const override { + return 0.0; + } +#endif + template void write_array(RNG& base_rng__, std::vector& params_r__, std::vector& params_i__, std::vector& vars__,