diff --git a/src/stan/io/opencl/deserializer.hpp b/src/stan/io/opencl/deserializer.hpp
new file mode 100644
index 00000000000..ad3ebd62d5f
--- /dev/null
+++ b/src/stan/io/opencl/deserializer.hpp
@@ -0,0 +1,692 @@
+#ifndef STAN_IO_OPENCL_DESERIALIZER_HPP
+#define STAN_IO_OPENCL_DESERIALIZER_HPP
+
+#ifdef STAN_OPENCL
+
+#include <stan/io/deserializer.hpp>
+#include <stan/io/opencl/utils.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/err/check_opencl.hpp>
+#include <stan/math/opencl/rev/vari.hpp>
+#include <stan/math/opencl/prim/constraint/lb_constrain.hpp>
+#include <stan/math/opencl/prim/constraint/ub_constrain.hpp>
+#include <stan/math/opencl/prim/constraint/lub_constrain.hpp>
+#include <stan/math/opencl/prim/constraint/offset_multiplier_constrain.hpp>
+#include <stan/math/opencl/prim/constraint/unit_vector_constrain.hpp>
+#include <stan/math/opencl/rev/constraint/lb_constrain.hpp>
+#include <stan/math/opencl/rev/constraint/ub_constrain.hpp>
+#include <stan/math/opencl/rev/constraint/lub_constrain.hpp>
+#include <stan/math/opencl/rev/constraint/offset_multiplier_constrain.hpp>
+#include <stan/math/opencl/rev/constraint/unit_vector_constrain.hpp>
+#include <stan/math/prim/meta.hpp>
+#include <stan/math/rev/core.hpp>
+
+#include <Eigen/Dense>
+#include <CL/opencl.hpp>
+
+#include <complex>
+#include <vector>
+#include <type_traits>
+
+namespace stan {
+namespace io {
+
+/**
+ * OpenCL deserializer specialization for matrix_cl buffers (prim).
+ */
+template <>
+class deserializer<stan::math::matrix_cl<double>> {
+ using mat_t = stan::math::matrix_cl<double>;
+ private:
+  const mat_t& data_r_;
+  Eigen::Map<const Eigen::Matrix<int, -1, 1>> map_i_;
+  size_t r_size_{0};
+  size_t i_size_{0};
+  size_t pos_r_{0};
+  size_t pos_i_{0};
+  size_t align_elems_{1};
+
+  /**
+   * Check there are at least m reals left to read.
+   *
+   * @param m Number of real elements to read.
+   * @throws std::runtime_error if there are insufficient elements.
+   */
+  void check_r_capacity(size_t m) const {
+    STAN_NO_RANGE_CHECKS_RETURN;
+    if (pos_r_ + m > r_size_) {
+      []() STAN_COLD_PATH {
+        throw std::runtime_error("no more scalars to read");
+      }();
+    }
+  }
+
+  /**
+   * Check there are at least m integers left to read.
+   *
+   * @param m Number of integer elements to read.
+   * @throws std::runtime_error if there are insufficient elements.
+   */
+  void check_i_capacity(size_t m) const {
+    STAN_NO_RANGE_CHECKS_RETURN;
+    if (pos_i_ + m > i_size_) {
+      []() STAN_COLD_PATH {
+        throw std::runtime_error("no more integers to read");
+      }();
+    }
+  }
+
+  /**
+   * Align the real position to the next aligned element offset.
+   */
+  inline void align_pos() { pos_r_ = internal::round_up(pos_r_, align_elems_); }
+
+  /**
+   * Read a block as a matrix_cl subbuffer.
+   *
+   * @param size Block size in elements.
+   * @param rows Number of rows.
+   * @param cols Number of cols.
+   * @return matrix_cl wrapping the subbuffer.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  inline mat_t read_matrix_cl_(size_t size, int rows,
+                                                       int cols) {
+    align_pos();
+    if (size == 0) {
+      return mat_t(rows, cols);
+    }
+    check_r_capacity(size);
+    const size_t origin_bytes = pos_r_ * sizeof(double);
+    const size_t size_bytes = size * sizeof(double);
+    cl_buffer_region region{origin_bytes, size_bytes};
+    cl::Buffer parent = data_r_.buffer();
+    cl::Buffer sub = parent.createSubBuffer(
+        CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region);
+    pos_r_ += size;
+    align_pos();
+    return mat_t(sub, rows, cols);
+  }
+
+ public:
+  /**
+   * Construct a deserializer over host buffers.
+   *
+   * @tparam RVec real data vector type.
+   * @tparam IntVec integer data vector type.
+   * @param data_r Real data.
+   * @param data_i Integer data.
+   */
+  template <typename RVec, typename IntVec,
+            require_all_vector_like_t<RVec, IntVec>* = nullptr>
+  deserializer(const RVec& data_r, const IntVec& data_i)
+      : data_r_(data_r),
+        map_i_(data_i.data(), data_i.size()),
+        r_size_(data_r.size()),
+        i_size_(data_i.size()) {}
+
+  /**
+   * Construct a deserializer over an OpenCL buffer with alignment.
+   *
+   * @tparam IntVec integer data vector type.
+   * @param data_r Device buffer of reals.
+   * @param data_i Integer data.
+   * @param align_elems Alignment in elements.
+   */
+  template <typename IntVec, require_vector_like_t<IntVec>* = nullptr>
+  deserializer(const mat_t& data_r, const IntVec& data_i,
+               size_t align_elems)
+      : data_r_(data_r),
+        map_i_(data_i.data(), data_i.size()),
+        r_size_(data_r.size()),
+        i_size_(data_i.size()),
+        align_elems_(std::max<size_t>(1, align_elems)) {}
+
+  /**
+   * @return Number of remaining real elements.
+   */
+  inline size_t available() const noexcept { return r_size_ - pos_r_; }
+  /**
+   * @return Number of remaining integer elements.
+   */
+  inline size_t available_i() const noexcept { return i_size_ - pos_i_; }
+
+  /**
+   * Read a scalar floating-point value.
+   *
+   * @tparam Ret Floating point type.
+   * @return Scalar value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret,
+            require_t<std::is_floating_point<Ret>>* = nullptr>
+  inline Ret read() {
+    auto cl_val = read_matrix_cl_(1, 1, 1);
+    return stan::math::from_matrix_cl<Ret>(cl_val);
+  }
+
+  /**
+   * Read a complex scalar (two consecutive reals).
+   *
+   * @tparam Ret Complex type.
+   * @return Complex value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, require_complex_t<Ret>* = nullptr>
+  inline Ret read() {
+    auto real = this->read<double>();
+    auto imag = this->read<double>();
+    return std::complex<double>{real, imag};
+  }
+
+  /**
+   * Read an integer value.
+   *
+   * @tparam Ret Integral type.
+   * @return Integer value.
+   * @throws std::runtime_error if there are insufficient elements.
+   */
+  template <typename Ret, require_integral_t<Ret>* = nullptr>
+  inline Ret read() {
+    check_i_capacity(1);
+    return map_i_.coeffRef(pos_i_++);
+  }
+
+  /**
+   * Read a matrix_cl with the given dimensions.
+   *
+   * @tparam Ret matrix_cl type.
+   * @param rows Rows.
+   * @param cols Cols.
+   * @return matrix_cl view of the subbuffer.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, require_matrix_cl_t<Ret>* = nullptr>
+  inline Ret read(Eigen::Index rows, Eigen::Index cols) {
+    return read_matrix_cl_(static_cast<size_t>(rows * cols),
+                           static_cast<int>(rows), static_cast<int>(cols));
+  }
+
+  /**
+   * Read a vector (matrix_cl with cols=1).
+   *
+   * @tparam Ret matrix_cl type.
+   * @param m Length.
+   * @return matrix_cl view of the subbuffer.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, require_matrix_cl_t<Ret>* = nullptr>
+  inline Ret read(Eigen::Index m) {
+    return read_matrix_cl_(static_cast<size_t>(m), static_cast<int>(m), 1);
+  }
+
+  /**
+   * Read a std::vector of elements.
+   *
+   * @tparam Ret std::vector type.
+   * @param m Vector length.
+   * @param dims Dimensions for each element.
+   * @return std::vector of deserialized elements.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, typename... Sizes,
+            require_std_vector_t<Ret>* = nullptr>
+  inline auto read(Eigen::Index m, Sizes... dims) {
+    std::decay_t<Ret> ret_vec;
+    if (unlikely(m == 0)) {
+      return ret_vec;
+    }
+    ret_vec.reserve(m);
+    for (size_t i = 0; i < static_cast<size_t>(m); ++i) {
+      ret_vec.emplace_back(this->read<value_type_t<Ret>>(dims...));
+    }
+    return ret_vec;
+  }
+
+  /**
+   * Read with lower-bound constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam LB Lower bound type.
+   * @tparam LP Log probability accumulator type.
+   * @param lb Lower bound.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename LB, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_lb(const LB& lb, LP& lp, Sizes... sizes) {
+    return stan::math::lb_constrain<Jacobian>(this->read<Ret>(sizes...), lb, lp);
+  }
+
+  /**
+   * Read with upper-bound constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam UB Upper bound type.
+   * @tparam LP Log probability accumulator type.
+   * @param ub Upper bound.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename UB, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_ub(const UB& ub, LP& lp, Sizes... sizes) {
+    return stan::math::ub_constrain<Jacobian>(this->read<Ret>(sizes...), ub, lp);
+  }
+
+  /**
+   * Read with lower/upper-bound constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam LB Lower bound type.
+   * @tparam UB Upper bound type.
+   * @tparam LP Log probability accumulator type.
+   * @param lb Lower bound.
+   * @param ub Upper bound.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename LB, typename UB, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_lub(const LB& lb, const UB& ub, LP& lp,
+                                 Sizes... sizes) {
+    return stan::math::lub_constrain<Jacobian>(this->read<Ret>(sizes...), lb, ub,
+                                               lp);
+  }
+
+  /**
+   * Read with offset-multiplier constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam M Offset type.
+   * @tparam S Multiplier type.
+   * @tparam LP Log probability accumulator type.
+   * @param mu Offset.
+   * @param sigma Multiplier.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename M, typename S, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_offset_multiplier(const M& mu, const S& sigma,
+                                               LP& lp, Sizes... sizes) {
+    return stan::math::offset_multiplier_constrain<Jacobian>(
+        this->read<Ret>(sizes...), mu, sigma, lp);
+  }
+
+  /**
+   * Read with unit-vector constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam LP Log probability accumulator type.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename LP, typename... Sizes>
+  inline auto read_constrain_unit_vector(LP& lp, Sizes... sizes) {
+    return stan::math::unit_vector_constrain<Jacobian>(this->read<Ret>(sizes...),
+                                                       lp);
+  }
+};
+
+/**
+ * OpenCL deserializer specialization for var_value<matrix_cl> buffers (rev).
+ */
+template <>
+class deserializer<stan::math::var_value<stan::math::matrix_cl<double>>> {
+ using mat_t = stan::math::matrix_cl<double>;
+ private:
+  std::reference_wrapper<mat_t> val_;
+  std::reference_wrapper<mat_t> adj_;
+  Eigen::Map<const Eigen::Matrix<int, -1, 1>> map_i_;
+  size_t r_size_{0};
+  size_t i_size_{0};
+  size_t pos_r_{0};
+  size_t pos_i_{0};
+  size_t align_elems_{1};
+
+  /**
+   * Check there are at least m reals left to read.
+   *
+   * @param m Number of real elements to read.
+   * @throws std::runtime_error if there are insufficient elements.
+   */
+  void check_r_capacity(size_t m) const {
+    STAN_NO_RANGE_CHECKS_RETURN;
+    if (pos_r_ + m > r_size_) {
+      []() STAN_COLD_PATH {
+        throw std::runtime_error("no more scalars to read");
+      }();
+    }
+  }
+
+  /**
+   * Check there are at least m integers left to read.
+   *
+   * @param m Number of integer elements to read.
+   * @throws std::runtime_error if there are insufficient elements.
+   */
+  void check_i_capacity(size_t m) const {
+    STAN_NO_RANGE_CHECKS_RETURN;
+    if (pos_i_ + m > i_size_) {
+      []() STAN_COLD_PATH {
+        throw std::runtime_error("no more integers to read");
+      }();
+    }
+  }
+
+  /**
+   * Align the real position to the next aligned element offset.
+   */
+  inline void align_pos() { pos_r_ = internal::round_up(pos_r_, align_elems_); }
+
+  /**
+   * Read a block as a var_value<matrix_cl> subbuffer.
+   *
+   * @param size Block size in elements.
+   * @param rows Number of rows.
+   * @param cols Number of cols.
+   * @return var_value wrapping value and adjoint subbuffers.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  inline stan::math::var_value<mat_t>
+  read_var_matrix_cl_(size_t size, int rows, int cols) {
+    align_pos();
+    if (size == 0) {
+      mat_t empty_val(rows, cols);
+      mat_t empty_adj(rows, cols);
+      auto* vi = new stan::math::vari_value<mat_t>(
+          std::move(empty_val), std::move(empty_adj));
+      return stan::math::var_value<mat_t>(vi);
+    }
+    check_r_capacity(size);
+    const size_t origin_bytes = pos_r_ * sizeof(double);
+    const size_t size_bytes = size * sizeof(double);
+    cl_buffer_region region{origin_bytes, size_bytes};
+    cl::Buffer sub_val = val_.get().buffer().createSubBuffer(
+        CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region);
+    cl::Buffer sub_adj = adj_.get().buffer().createSubBuffer(
+        CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region);
+    pos_r_ += size;
+    align_pos();
+    mat_t val_mat(std::move(sub_val), rows, cols);
+    mat_t adj_mat(std::move(sub_adj), rows, cols);
+    auto* vi = new stan::math::vari_value<mat_t>(
+        std::move(val_mat), std::move(adj_mat));
+    return stan::math::var_value<mat_t>(vi);
+  }
+
+ public:
+  /**
+   * Construct a deserializer over a var_value<matrix_cl> buffer.
+   *
+   * @tparam IntVec integer data vector type.
+   * @param data_r Device buffer with values and adjoints.
+   * @param data_i Integer data.
+   * @param align_elems Alignment in elements.
+   */
+  template <typename IntVec, require_vector_like_t<IntVec>* = nullptr>
+  deserializer(const stan::math::var_value<mat_t>& data_r,
+               const IntVec& data_i, size_t align_elems)
+      : val_(data_r.val()),
+        adj_(data_r.adj()),
+        map_i_(data_i.data(), data_i.size()),
+        r_size_(data_r.val().size()),
+        i_size_(data_i.size()),
+        align_elems_(std::max<size_t>(1, align_elems)) {}
+
+  /**
+   * @return Number of remaining real elements.
+   */
+  inline size_t available() const noexcept { return r_size_ - pos_r_; }
+  /**
+   * @return Number of remaining integer elements.
+   */
+  inline size_t available_i() const noexcept { return i_size_ - pos_i_; }
+
+  /**
+   * Read a scalar floating-point value.
+   *
+   * @tparam Ret Floating point type.
+   * @return Scalar value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret,
+            require_t<std::is_floating_point<Ret>>* = nullptr>
+  inline Ret read() {
+    auto cl_val = read_var_matrix_cl_(1, 1, 1).val();
+    return stan::math::from_matrix_cl<Ret>(cl_val);
+  }
+
+  /**
+   * Read a complex scalar (two consecutive reals).
+   *
+   * @tparam Ret Complex type.
+   * @return Complex value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, require_complex_t<Ret>* = nullptr>
+  inline Ret read() {
+    auto real = this->read<double>();
+    auto imag = this->read<double>();
+    return std::complex<double>{real, imag};
+  }
+
+  /**
+   * Read an integer value.
+   *
+   * @tparam Ret Integral type.
+   * @return Integer value.
+   * @throws std::runtime_error if there are insufficient elements.
+   */
+  template <typename Ret, require_integral_t<Ret>* = nullptr>
+  inline Ret read() {
+    check_i_capacity(1);
+    return map_i_.coeffRef(pos_i_++);
+  }
+
+  /**
+   * Read a var_value<matrix_cl> with the given dimensions.
+   *
+   * @tparam Ret var_value<matrix_cl> type.
+   * @param rows Rows.
+   * @param cols Cols.
+   * @return var_value view of the subbuffers.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret,
+            require_t<std::is_same<
+                Ret,
+                stan::math::var_value<mat_t>>>* = nullptr>
+  inline Ret read(Eigen::Index rows, Eigen::Index cols) {
+    return read_var_matrix_cl_(static_cast<size_t>(rows * cols),
+                               static_cast<int>(rows),
+                               static_cast<int>(cols));
+  }
+
+  /**
+   * Read a vector as var_value<matrix_cl> (cols=1).
+   *
+   * @tparam Ret var_value<matrix_cl> type.
+   * @param m Length.
+   * @return var_value view of the subbuffers.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret,
+            require_t<std::is_same<
+                Ret,
+                stan::math::var_value<mat_t>>>* = nullptr>
+  inline Ret read(Eigen::Index m) {
+    return read_var_matrix_cl_(static_cast<size_t>(m), static_cast<int>(m), 1);
+  }
+
+  /**
+   * Read a std::vector of elements.
+   *
+   * @tparam Ret std::vector type.
+   * @param m Vector length.
+   * @param dims Dimensions for each element.
+   * @return std::vector of deserialized elements.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, typename... Sizes,
+            require_std_vector_t<Ret>* = nullptr>
+  inline auto read(Eigen::Index m, Sizes... dims) {
+    std::decay_t<Ret> ret_vec;
+    if (unlikely(m == 0)) {
+      return ret_vec;
+    }
+    ret_vec.reserve(m);
+    for (size_t i = 0; i < static_cast<size_t>(m); ++i) {
+      ret_vec.emplace_back(this->read<value_type_t<Ret>>(dims...));
+    }
+    return ret_vec;
+  }
+
+  /**
+   * Read with lower-bound constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam LB Lower bound type.
+   * @tparam LP Log probability accumulator type.
+   * @param lb Lower bound.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename LB, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_lb(const LB& lb, LP& lp, Sizes... sizes) {
+    return stan::math::lb_constrain<Jacobian>(this->read<Ret>(sizes...), lb, lp);
+  }
+
+  /**
+   * Read with upper-bound constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam UB Upper bound type.
+   * @tparam LP Log probability accumulator type.
+   * @param ub Upper bound.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename UB, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_ub(const UB& ub, LP& lp, Sizes... sizes) {
+    return stan::math::ub_constrain<Jacobian>(this->read<Ret>(sizes...), ub, lp);
+  }
+
+  /**
+   * Read with lower/upper-bound constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam LB Lower bound type.
+   * @tparam UB Upper bound type.
+   * @tparam LP Log probability accumulator type.
+   * @param lb Lower bound.
+   * @param ub Upper bound.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename LB, typename UB, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_lub(const LB& lb, const UB& ub, LP& lp,
+                                 Sizes... sizes) {
+    return stan::math::lub_constrain<Jacobian>(this->read<Ret>(sizes...), lb, ub,
+                                               lp);
+  }
+
+  /**
+   * Read with offset-multiplier constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam M Offset type.
+   * @tparam S Multiplier type.
+   * @tparam LP Log probability accumulator type.
+   * @param mu Offset.
+   * @param sigma Multiplier.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename M, typename S, typename LP,
+            typename... Sizes>
+  inline auto read_constrain_offset_multiplier(const M& mu, const S& sigma,
+                                               LP& lp, Sizes... sizes) {
+    return stan::math::offset_multiplier_constrain<Jacobian>(
+        this->read<Ret>(sizes...), mu, sigma, lp);
+  }
+
+  /**
+   * Read with unit-vector constraint.
+   *
+   * @tparam Ret Return type.
+   * @tparam Jacobian Whether to include Jacobian.
+   * @tparam LP Log probability accumulator type.
+   * @param lp Log probability accumulator.
+   * @param sizes Dimensions for the read.
+   * @return Constrained value.
+   * @throws std::runtime_error if there are insufficient elements.
+   * @throws cl::Error if subbuffer creation fails.
+   */
+  template <typename Ret, bool Jacobian, typename LP, typename... Sizes>
+  inline auto read_constrain_unit_vector(LP& lp, Sizes... sizes) {
+    return stan::math::unit_vector_constrain<Jacobian>(this->read<Ret>(sizes...),
+                                                       lp);
+  }
+};
+
+}  // namespace io
+}  // namespace stan
+
+#endif  // STAN_OPENCL
+
+#endif  // STAN_IO_OPENCL_DESERIALIZER_HPP
diff --git a/src/stan/io/opencl/utils.hpp b/src/stan/io/opencl/utils.hpp
new file mode 100644
index 00000000000..1b36f43988a
--- /dev/null
+++ b/src/stan/io/opencl/utils.hpp
@@ -0,0 +1,231 @@
+#ifndef STAN_IO_OPENCL_UTILS_HPP
+#define STAN_IO_OPENCL_UTILS_HPP
+
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/opencl_context.hpp>
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/err/check_opencl.hpp>
+#include <stan/math/opencl/rev/vari.hpp>
+#include <stan/math/rev/core/var.hpp>
+#include <stan/math/prim/err/check_size_match.hpp>
+#include <stan/math/prim/meta.hpp>
+#include <stan/math/prim/fun/Eigen.hpp>
+
+#include <CL/opencl.hpp>
+#include <Eigen/Dense>
+
+#include <numeric>
+#include <vector>
+#include <algorithm>
+
+namespace stan {
+namespace io {
+
+/**
+ * Layout information for aligned OpenCL serializer buffers.
+ *
+ * Offsets and sizes are in elements (not bytes).
+ */
+struct serializer_layout {
+  /** Aligned offsets in elements for each block. */
+  std::vector<size_t> offsets;
+  /** Block sizes in elements. */
+  std::vector<size_t> sizes;
+  /** Total size (including padding) in elements. */
+  size_t total_size{0};
+  /** Alignment in elements. */
+  size_t align_elems{1};
+};
+
+namespace internal {
+/**
+ * Round a value up to the next multiple.
+ *
+ * @param value Value to round up.
+ * @param multiple Alignment multiple.
+ * @return Rounded value (>= value).
+ */
+inline size_t round_up(size_t value, size_t multiple) {
+  if (multiple == 0) {
+    return value;
+  }
+  const size_t rem = value % multiple;
+  return rem == 0 ? value : value + (multiple - rem);
+}
+
+/**
+ * Query the device alignment and return required alignment in elements.
+ *
+ * @return Alignment in elements for double buffers.
+ * @throws cl::Error if OpenCL device queries fail.
+ */
+inline size_t align_elems_from_device() {
+  size_t align_bits
+      = stan::math::opencl_context.device()[0]
+            .getInfo<CL_DEVICE_MEM_BASE_ADDR_ALIGN>();
+  size_t align_bytes = (align_bits + 7) / 8;
+  if (align_bytes == 0) {
+    return 1;
+  }
+  const size_t elem_size = sizeof(double);
+  const size_t g = std::gcd(align_bytes, elem_size);
+  const size_t align_elems = align_bytes / (g == 0 ? 1 : g);
+  return align_elems == 0 ? 1 : align_elems;
+}
+}  // namespace internal
+
+/**
+ * Compute aligned offsets and total size for parameter blocks.
+ *
+ * @param sizes Block sizes in read order (elements).
+ * @param align_elems Alignment in elements.
+ * @return Layout with aligned offsets and total size.
+ */
+inline serializer_layout compute_serializer_layout(
+    const std::vector<size_t>& sizes, size_t align_elems) {
+  serializer_layout layout;
+  layout.sizes = sizes;
+  layout.align_elems = std::max<size_t>(1, align_elems);
+  layout.offsets.reserve(sizes.size());
+  size_t pos = 0;
+  for (size_t size : sizes) {
+    pos = internal::round_up(pos, layout.align_elems);
+    layout.offsets.push_back(pos);
+    pos += size;
+  }
+  layout.total_size = pos;
+  return layout;
+}
+
+/**
+ * Allocate a matrix_cl buffer sized to the serialized layout.
+ *
+ * @param layout Serializer layout.
+ * @param flags OpenCL memory flags.
+ * @return Device buffer with shape (layout.total_size, 1).
+ * @throws std::system_error if an OpenCL error occurs.
+ */
+inline stan::math::matrix_cl<double> allocate_serializer_buffer(
+    const serializer_layout& layout, cl_mem_flags flags) {
+  if (layout.total_size == 0) {
+    return stan::math::matrix_cl<double>();
+  }
+  cl::Context& ctx = stan::math::opencl_context.context();
+  try {
+    cl_mem_flags alloc_flags = flags;
+    if (stan::math::opencl_context.device()[0]
+            .getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
+      alloc_flags |= CL_MEM_ALLOC_HOST_PTR;
+    }
+    cl::Buffer buffer(ctx, alloc_flags, sizeof(double) * layout.total_size);
+    return stan::math::matrix_cl<double>(buffer, layout.total_size, 1);
+  } catch (const cl::Error& e) {
+    stan::math::check_opencl_error("allocate_serializer_buffer", e);
+  }
+  return stan::math::matrix_cl<double>();
+}
+
+/**
+ * Copy host parameters into aligned blocks of a device buffer.
+ *
+ * @param src Flat parameters with no padding.
+ * @param dst Device buffer to receive padded blocks.
+ * @param layout Serializer layout describing offsets and sizes.
+ * @throws std::invalid_argument if src size does not match sum of sizes.
+ * @throws std::system_error if an OpenCL error occurs.
+ */
+inline void copy_to_serialize_buffer(const Eigen::VectorXd& src,
+                                     stan::math::matrix_cl<double>& dst,
+                                     const serializer_layout& layout) {
+  const size_t total_src = static_cast<size_t>(src.size());
+  const size_t total_sizes
+      = std::accumulate(layout.sizes.begin(), layout.sizes.end(), size_t{0});
+  stan::math::check_size_match("copy_to_serialize_buffer", "src.size()",
+                               total_src, "sum(sizes)", total_sizes);
+  if (layout.total_size == 0) {
+    return;
+  }
+
+  auto& queue = stan::math::opencl_context.queue();
+  std::vector<cl::Event> events;
+  events.reserve(layout.sizes.size());
+  size_t src_offset = 0;
+
+  try {
+    for (size_t i = 0; i < layout.sizes.size(); ++i) {
+      const size_t block_size = layout.sizes[i];
+      if (block_size == 0) {
+        continue;
+      }
+      const size_t origin_bytes = layout.offsets[i] * sizeof(double);
+      const size_t size_bytes = block_size * sizeof(double);
+      cl_buffer_region region{origin_bytes, size_bytes};
+      cl::Buffer sub_dst = dst.buffer().createSubBuffer(
+          CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region);
+
+      cl::Event event;
+      queue.enqueueWriteBuffer(sub_dst, CL_FALSE, 0, size_bytes,
+                               src.data() + src_offset, nullptr, &event);
+      events.push_back(event);
+      src_offset += block_size;
+    }
+
+    for (cl::Event& e : events) {
+      e.wait();
+    }
+  } catch (const cl::Error& e) {
+    stan::math::check_opencl_error("copy_to_serialize_buffer", e);
+  }
+}
+
+/**
+ * Serialize host parameters into a padded OpenCL buffer.
+ *
+ * @param params Flat unconstrained parameters.
+ * @param dimss Parameter dimensions (unused, passed for API parity).
+ * @param sizes Unconstrained block sizes.
+ * @return var_value holding values and adjoints buffers.
+ * @throws std::system_error if an OpenCL error occurs.
+ * @throws std::invalid_argument if params size does not match sum of sizes.
+ */
+inline stan::math::var_value<stan::math::matrix_cl<double>> serialize_to_opencl(
+    const Eigen::VectorXd& params,
+    const std::vector<std::vector<size_t>>& dimss,
+    const std::vector<size_t>& sizes) {
+  (void)dimss;
+  const size_t align_elems = internal::align_elems_from_device();
+  const serializer_layout layout = compute_serializer_layout(sizes, align_elems);
+
+  stan::math::matrix_cl<double> values
+      = allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+  stan::math::matrix_cl<double> adjoints
+      = allocate_serializer_buffer(layout, CL_MEM_READ_WRITE);
+
+  if (layout.total_size > 0) {
+    auto& queue = stan::math::opencl_context.queue();
+    try {
+      double zero = 0.0;
+      queue.enqueueFillBuffer(values.buffer(), zero, 0,
+                              sizeof(double) * layout.total_size);
+      queue.enqueueFillBuffer(adjoints.buffer(), zero, 0,
+                              sizeof(double) * layout.total_size);
+      queue.finish();
+    } catch (const cl::Error& e) {
+      stan::math::check_opencl_error("serialize_to_opencl", e);
+    }
+  }
+
+  copy_to_serialize_buffer(params, values, layout);
+
+  auto* vi = new stan::math::vari_value<stan::math::matrix_cl<double>>(
+      std::move(values), std::move(adjoints));
+  return stan::math::var_value<stan::math::matrix_cl<double>>(vi);
+}
+
+}  // namespace io
+}  // namespace stan
+
+#endif  // STAN_OPENCL
+
+#endif  // STAN_IO_OPENCL_UTILS_HPP
diff --git a/src/stan/model/model_base.hpp b/src/stan/model/model_base.hpp
index ad92415330d..d070ec4a728 100644
--- a/src/stan/model/model_base.hpp
+++ b/src/stan/model/model_base.hpp
@@ -6,9 +6,13 @@
 #endif
 #include <stan/io/var_context.hpp>
 #include <stan/math/rev/core.hpp>
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/matrix_cl.hpp>
+#endif
 #include <stan/model/prob_grad.hpp>
 #include <stan/services/util/create_rng.hpp>
 #include <ostream>
+#include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
@@ -203,6 +207,41 @@ class model_base : public prob_grad {
   virtual math::var log_prob(Eigen::Matrix<math::var, -1, 1>& params_r,
                              std::ostream* msgs) const = 0;
 
+#ifdef STAN_OPENCL
+  /**
+   * Return the log density for the specified OpenCL unconstrained parameters,
+   * without Jacobian and with normalizing constants for probability functions.
+   *
+   * @param[in] params_r unconstrained parameters on the device
+   * @param[in,out] msgs message stream
+   * @return log density for specified parameters
+   */
+  virtual math::var log_prob(math::matrix_cl<double>& params_r,
+                             std::ostream* msgs) const {
+    static_cast<void>(params_r);
+    static_cast<void>(msgs);
+    throw std::runtime_error(
+        "OpenCL log_prob not implemented for this model.");
+  }
+
+  /**
+   * Return the log density for the specified OpenCL unconstrained parameters,
+   * without Jacobian and with normalizing constants for probability functions.
+   *
+   * @param[in] params_r unconstrained parameters on the device with adjoints
+   * @param[in,out] msgs message stream
+   * @return log density for specified parameters
+   */
+  virtual math::var log_prob(
+      math::var_value<math::matrix_cl<double>>& params_r,
+      std::ostream* msgs) const {
+    static_cast<void>(params_r);
+    static_cast<void>(msgs);
+    throw std::runtime_error(
+        "OpenCL log_prob not implemented for this model.");
+  }
+#endif
+
   /**
    * Return the log density for the specified unconstrained
    * parameters, with Jacobian correction for constraints and with
diff --git a/src/stan/model/model_base_crtp.hpp b/src/stan/model/model_base_crtp.hpp
index cfb54a91ad5..c3cca5a1ed2 100644
--- a/src/stan/model/model_base_crtp.hpp
+++ b/src/stan/model/model_base_crtp.hpp
@@ -53,6 +53,19 @@ namespace model {
  *                  std::ostream* msgs = 0) const
  * ```
  *
+ * When STAN_OPENCL is defined, the derived class may also implement
+ * the OpenCL overloads to enable OpenCL deserialization:
+ *
+ * ```
+ * math::var log_prob(math::matrix_cl<double>& params_r,
+ *                    std::ostream* msgs = 0) const;
+ * math::var log_prob(math::var_value<math::matrix_cl<double>>& params_r,
+ *                    std::ostream* msgs = 0) const;
+ * ```
+ *
+ * If these overloads are not provided, calling the OpenCL log_prob overload
+ * will throw at runtime.
+ *
  * <p>The derived class `M` must be declared following the curiously
  * recursive template pattern, for example, if `M` is `foo_model`,
  * then `foo_model` should be declared as
diff --git a/src/stan/model/model_header.hpp b/src/stan/model/model_header.hpp
index a3896ecc6cb..a603a90eb6f 100644
--- a/src/stan/model/model_header.hpp
+++ b/src/stan/model/model_header.hpp
@@ -5,6 +5,12 @@
 #include <stan/model/model_base_crtp.hpp>
 #include <stan/math.hpp>
 
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/rev.hpp>
+#include <stan/io/opencl/deserializer.hpp>
+#include <stan/io/opencl/utils.hpp>
+#endif
+
 #include <stan/io/deserializer.hpp>
 #include <stan/io/serializer.hpp>
 
diff --git a/src/test/unit/io/deserializer_opencl_stdvector_test.cpp b/src/test/unit/io/deserializer_opencl_stdvector_test.cpp
new file mode 100644
index 00000000000..6634c4236cc
--- /dev/null
+++ b/src/test/unit/io/deserializer_opencl_stdvector_test.cpp
@@ -0,0 +1,89 @@
+#ifdef STAN_OPENCL
+#include <stan/io/deserializer.hpp>
+#include <stan/io/opencl/deserializer.hpp>
+#include <stan/io/opencl/utils.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <test/unit/math/expect_near_rel.hpp>
+#include <gtest/gtest.h>
+#include <complex>
+#include <numeric>
+
+namespace {
+void append_sizes(std::vector<size_t>& sizes, size_t count, size_t size) {
+  sizes.insert(sizes.end(), count, size);
+}
+
+Eigen::VectorXd make_params(const std::vector<size_t>& sizes) {
+  const size_t total = std::accumulate(sizes.begin(), sizes.end(), size_t{0});
+  Eigen::VectorXd params(static_cast<Eigen::Index>(total));
+  for (Eigen::Index i = 0; i < params.size(); ++i) {
+    params.coeffRef(i) = static_cast<double>(i + 1);
+  }
+  return params;
+}
+}  // namespace
+
+TEST(deserializer_opencl_stdvector, read_varied_containers) {
+  std::vector<int> theta_i;
+
+  std::vector<size_t> sizes;
+  append_sizes(sizes, 4, 1);   // std::vector<double>(4)
+  append_sizes(sizes, 4, 1);   // std::vector<std::complex<double>>(2)
+  append_sizes(sizes, 2, 4);   // std::vector<matrix_cl>(2, 2x2)
+  append_sizes(sizes, 6, 2);   // std::vector<std::vector<matrix_cl>>(2,3,1x2)
+
+  Eigen::VectorXd params = make_params(sizes);
+  auto align_elems = stan::io::internal::align_elems_from_device();
+  auto layout = stan::io::compute_serializer_layout(sizes, align_elems);
+  auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+  stan::io::copy_to_serialize_buffer(params, values, layout);
+
+  std::vector<double> params_vec(params.data(),
+                                 params.data() + params.size());
+  stan::io::deserializer<double> cpu(params_vec, theta_i);
+  stan::io::deserializer<stan::math::matrix_cl<double>> deserializer(values,
+                                                                     theta_i,
+                                                                     align_elems);
+
+  auto scalars = deserializer.read<std::vector<double>>(4);
+  auto scalars_ref = cpu.read<std::vector<double>>(4);
+  ASSERT_EQ(scalars.size(), scalars_ref.size());
+  for (size_t i = 0; i < scalars.size(); ++i) {
+    EXPECT_FLOAT_EQ(scalars_ref[i], scalars[i]);
+  }
+
+  auto complex_vals = deserializer.read<std::vector<std::complex<double>>>(2);
+  auto complex_ref = cpu.read<std::vector<std::complex<double>>>(2);
+  ASSERT_EQ(complex_vals.size(), complex_ref.size());
+  for (size_t i = 0; i < complex_vals.size(); ++i) {
+    EXPECT_FLOAT_EQ(complex_ref[i].real(), complex_vals[i].real());
+    EXPECT_FLOAT_EQ(complex_ref[i].imag(), complex_vals[i].imag());
+  }
+
+  auto mats = deserializer.read<std::vector<stan::math::matrix_cl<double>>>(
+      2, 2, 2);
+  auto mats_ref = cpu.read<std::vector<Eigen::MatrixXd>>(2, 2, 2);
+  ASSERT_EQ(mats.size(), mats_ref.size());
+  for (size_t i = 0; i < mats.size(); ++i) {
+    Eigen::MatrixXd mat = stan::math::from_matrix_cl(mats[i]);
+    stan::test::expect_near_rel("deserializer_opencl", mat, mats_ref[i]);
+  }
+
+  auto nested = deserializer.read<
+      std::vector<std::vector<stan::math::matrix_cl<double>>>>(2, 3, 1, 2);
+  auto nested_ref
+      = cpu.read<std::vector<std::vector<Eigen::MatrixXd>>>(2, 3, 1, 2);
+  ASSERT_EQ(nested.size(), nested_ref.size());
+  for (size_t i = 0; i < nested.size(); ++i) {
+    ASSERT_EQ(nested[i].size(), nested_ref[i].size());
+    for (size_t j = 0; j < nested[i].size(); ++j) {
+      Eigen::MatrixXd mat = stan::math::from_matrix_cl(nested[i][j]);
+      stan::test::expect_near_rel("deserializer_opencl", mat,
+                                  nested_ref[i][j]);
+    }
+  }
+}
+#else
+#include <gtest/gtest.h>
+TEST(deserializer_opencl_stdvector, dummy) { EXPECT_NO_THROW(); }
+#endif
diff --git a/src/test/unit/io/deserializer_opencl_test.cpp b/src/test/unit/io/deserializer_opencl_test.cpp
new file mode 100644
index 00000000000..acd7efa9b07
--- /dev/null
+++ b/src/test/unit/io/deserializer_opencl_test.cpp
@@ -0,0 +1,210 @@
+#ifdef STAN_OPENCL
+#include <stan/io/deserializer.hpp>
+#include <stan/io/opencl/deserializer.hpp>
+#include <stan/io/opencl/utils.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <stan/math/opencl/opencl_context.hpp>
+#include <stan/math/opencl/prim.hpp>
+#include <stan/math/prim.hpp>
+#include <test/unit/math/expect_near_rel.hpp>
+#include <gtest/gtest.h>
+#include <complex>
+#include <numeric>
+
+namespace {
+struct opencl_pack {
+  stan::math::matrix_cl<double> values;
+  stan::io::serializer_layout layout;
+  size_t align_elems;
+};
+
+opencl_pack pack_opencl_values(const Eigen::VectorXd& params,
+                               const std::vector<size_t>& sizes) {
+  opencl_pack pack;
+  pack.align_elems = stan::io::internal::align_elems_from_device();
+  pack.layout = stan::io::compute_serializer_layout(sizes, pack.align_elems);
+  pack.values = stan::io::allocate_serializer_buffer(pack.layout,
+                                                     CL_MEM_READ_ONLY);
+  stan::io::copy_to_serialize_buffer(params, pack.values, pack.layout);
+  return pack;
+}
+
+Eigen::VectorXd make_params(const std::vector<size_t>& sizes) {
+  const size_t total = std::accumulate(sizes.begin(), sizes.end(), size_t{0});
+  Eigen::VectorXd params(static_cast<Eigen::Index>(total));
+  for (Eigen::Index i = 0; i < params.size(); ++i) {
+    params.coeffRef(i) = static_cast<double>(i + 1);
+  }
+  return params;
+}
+}  // namespace
+
+TEST(deserializer_opencl_mixed, read_scalar_complex_vector_matrix) {
+  std::vector<int> theta_i{7};
+  std::vector<size_t> sizes{1, 1, 1, 3, 4, 4};
+  Eigen::VectorXd params = make_params(sizes);
+  auto pack = pack_opencl_values(params, sizes);
+
+  std::vector<double> params_vec(params.data(),
+                                 params.data() + params.size());
+  stan::io::deserializer<double> cpu(params_vec, theta_i);
+  stan::io::deserializer<stan::math::matrix_cl<double>> deserializer(
+      pack.values, theta_i, pack.align_elems);
+
+  double x = deserializer.read<double>();
+  EXPECT_FLOAT_EQ(cpu.read<double>(), x);
+
+  std::complex<double> z = deserializer.read<std::complex<double>>();
+  std::complex<double> z_ref = cpu.read<std::complex<double>>();
+  EXPECT_FLOAT_EQ(z_ref.real(), z.real());
+  EXPECT_FLOAT_EQ(z_ref.imag(), z.imag());
+
+  auto vec_cl = deserializer.read<stan::math::matrix_cl<double>>(3);
+  Eigen::VectorXd vec = stan::math::from_matrix_cl<Eigen::VectorXd>(vec_cl);
+  Eigen::VectorXd vec_ref = cpu.read<Eigen::VectorXd>(3);
+  stan::test::expect_near_rel("deserializer_opencl", vec, vec_ref);
+
+  auto row_cl = deserializer.read<stan::math::matrix_cl<double>>(1, 4);
+  Eigen::MatrixXd row = stan::math::from_matrix_cl<Eigen::MatrixXd>(row_cl);
+  Eigen::RowVectorXd row_ref = cpu.read<Eigen::RowVectorXd>(4);
+  stan::test::expect_near_rel("deserializer_opencl", row,
+                              Eigen::MatrixXd(row_ref));
+
+  auto mat_cl = deserializer.read<stan::math::matrix_cl<double>>(2, 2);
+  Eigen::MatrixXd mat = stan::math::from_matrix_cl<Eigen::MatrixXd>(mat_cl);
+  Eigen::MatrixXd mat_ref = cpu.read<Eigen::MatrixXd>(2, 2);
+  stan::test::expect_near_rel("deserializer_opencl", mat, mat_ref);
+
+  int i = deserializer.read<int>();
+  EXPECT_EQ(cpu.read<int>(), i);
+}
+
+TEST(deserializer_opencl_constraints, read_lb) {
+  std::vector<int> theta_i;
+  std::vector<size_t> sizes{3};
+  Eigen::VectorXd params = make_params(sizes);
+  auto pack = pack_opencl_values(params, sizes);
+
+  std::vector<double> params_vec(params.data(),
+                                 params.data() + params.size());
+  stan::io::deserializer<double> cpu(params_vec, theta_i);
+  stan::io::deserializer<stan::math::matrix_cl<double>> deserializer(
+      pack.values, theta_i, pack.align_elems);
+
+  double lp = 0.0;
+  auto lb_cl
+      = deserializer.read_constrain_lb<stan::math::matrix_cl<double>, true>(
+          -1.0, lp, 3);
+  double lp_ref = 0.0;
+  auto lb_ref = stan::math::lb_constrain<true>(cpu.read<Eigen::VectorXd>(3),
+                                               -1.0, lp_ref);
+  Eigen::VectorXd lb_host
+      = stan::math::from_matrix_cl<Eigen::VectorXd>(lb_cl);
+  stan::test::expect_near_rel("deserializer_opencl", lb_host, lb_ref);
+  EXPECT_NEAR(lp_ref, lp, 1e-8);
+}
+
+TEST(deserializer_opencl_constraints, read_ub) {
+  std::vector<int> theta_i;
+  std::vector<size_t> sizes{2};
+  Eigen::VectorXd params = make_params(sizes);
+  auto pack = pack_opencl_values(params, sizes);
+
+  std::vector<double> params_vec(params.data(),
+                                 params.data() + params.size());
+  stan::io::deserializer<double> cpu(params_vec, theta_i);
+  stan::io::deserializer<stan::math::matrix_cl<double>> deserializer(
+      pack.values, theta_i, pack.align_elems);
+
+  double lp = 0.0;
+  auto ub_cl
+      = deserializer.read_constrain_ub<stan::math::matrix_cl<double>, true>(
+          2.0, lp, 2);
+  double lp_ref = 0.0;
+  auto ub_ref = stan::math::ub_constrain<true>(cpu.read<Eigen::VectorXd>(2),
+                                               2.0, lp_ref);
+  Eigen::VectorXd ub_host
+      = stan::math::from_matrix_cl<Eigen::VectorXd>(ub_cl);
+  stan::test::expect_near_rel("deserializer_opencl", ub_host, ub_ref);
+  EXPECT_NEAR(lp_ref, lp, 1e-8);
+}
+
+TEST(deserializer_opencl_constraints, read_lub) {
+  std::vector<int> theta_i;
+  std::vector<size_t> sizes{4};
+  Eigen::VectorXd params = make_params(sizes);
+  auto pack = pack_opencl_values(params, sizes);
+
+  std::vector<double> params_vec(params.data(),
+                                 params.data() + params.size());
+  stan::io::deserializer<double> cpu(params_vec, theta_i);
+  stan::io::deserializer<stan::math::matrix_cl<double>> deserializer(
+      pack.values, theta_i, pack.align_elems);
+
+  double lp = 0.0;
+  auto lub_cl
+      = deserializer.read_constrain_lub<stan::math::matrix_cl<double>, true>(
+          -1.0, 1.0, lp, 4);
+  double lp_ref = 0.0;
+  auto lub_ref = stan::math::lub_constrain<true>(cpu.read<Eigen::VectorXd>(4),
+                                                 -1.0, 1.0, lp_ref);
+  Eigen::VectorXd lub_host
+      = stan::math::from_matrix_cl<Eigen::VectorXd>(lub_cl);
+  stan::test::expect_near_rel("deserializer_opencl", lub_host, lub_ref);
+  EXPECT_NEAR(lp_ref, lp, 1e-8);
+}
+
+TEST(deserializer_opencl_constraints, read_offset_multiplier) {
+  std::vector<int> theta_i;
+  std::vector<size_t> sizes{3};
+  Eigen::VectorXd params = make_params(sizes);
+  auto pack = pack_opencl_values(params, sizes);
+
+  std::vector<double> params_vec(params.data(),
+                                 params.data() + params.size());
+  stan::io::deserializer<double> cpu(params_vec, theta_i);
+  stan::io::deserializer<stan::math::matrix_cl<double>> deserializer(
+      pack.values, theta_i, pack.align_elems);
+
+  double lp = 0.0;
+  auto off_cl = deserializer
+                    .read_constrain_offset_multiplier<
+                        stan::math::matrix_cl<double>, true>(1.5, 2.0, lp, 3);
+  double lp_ref = 0.0;
+  auto off_ref = stan::math::offset_multiplier_constrain<true>(
+      cpu.read<Eigen::VectorXd>(3), 1.5, 2.0, lp_ref);
+  Eigen::VectorXd off_host
+      = stan::math::from_matrix_cl<Eigen::VectorXd>(off_cl);
+  stan::test::expect_near_rel("deserializer_opencl", off_host, off_ref);
+  EXPECT_NEAR(lp_ref, lp, 1e-8);
+}
+
+TEST(deserializer_opencl_constraints, subbuffer_addition) {
+  std::vector<int> theta_i;
+  std::vector<size_t> sizes{4, 4};
+  Eigen::VectorXd params = make_params(sizes);
+  auto pack = pack_opencl_values(params, sizes);
+
+  std::vector<double> params_vec(params.data(),
+                                 params.data() + params.size());
+  stan::io::deserializer<double> cpu(params_vec, theta_i);
+  stan::io::deserializer<stan::math::matrix_cl<double>> deserializer(
+      pack.values, theta_i, pack.align_elems);
+
+  auto a_cl = deserializer.read<stan::math::matrix_cl<double>>(2, 2);
+  auto b_cl = deserializer.read<stan::math::matrix_cl<double>>(2, 2);
+  stan::math::matrix_cl<double> sum_cl = a_cl + b_cl;
+
+  Eigen::MatrixXd a_ref = cpu.read<Eigen::MatrixXd>(2, 2);
+  Eigen::MatrixXd b_ref = cpu.read<Eigen::MatrixXd>(2, 2);
+  Eigen::MatrixXd sum_ref = a_ref + b_ref;
+
+  Eigen::MatrixXd sum_host
+      = stan::math::from_matrix_cl<Eigen::MatrixXd>(sum_cl);
+  stan::test::expect_near_rel("deserializer_opencl", sum_host, sum_ref);
+}
+#else
+#include <gtest/gtest.h>
+TEST(deserializer_opencl_mixed, dummy) { EXPECT_NO_THROW(); }
+TEST(deserializer_opencl_constraints, dummy) { EXPECT_NO_THROW(); }
+#endif
diff --git a/src/test/unit/io/deserializer_opencl_varmat_test.cpp b/src/test/unit/io/deserializer_opencl_varmat_test.cpp
new file mode 100644
index 00000000000..11f268673b1
--- /dev/null
+++ b/src/test/unit/io/deserializer_opencl_varmat_test.cpp
@@ -0,0 +1,89 @@
+#ifdef STAN_OPENCL
+#include <stan/io/opencl/deserializer.hpp>
+#include <stan/io/opencl/utils.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <stan/math/opencl/kernel_generator.hpp>
+#include <gtest/gtest.h>
+
+TEST(deserializer_opencl_varmat, read_and_adj) {
+  std::vector<int> theta_i;
+  Eigen::VectorXd params(6);
+  params << 1, 2, 3, 4, 5, 6;
+
+  std::vector<size_t> sizes{6};
+  auto align_elems = stan::io::internal::align_elems_from_device();
+  auto layout = stan::io::compute_serializer_layout(sizes, align_elems);
+  auto var_buf = stan::io::serialize_to_opencl(params, {}, sizes);
+
+  stan::io::deserializer<stan::math::var_value<stan::math::matrix_cl<double>>>
+      deserializer(var_buf, theta_i, align_elems);
+  auto mat_var = deserializer.read<stan::math::var_value<stan::math::matrix_cl<double>>>(3, 2);
+
+  Eigen::MatrixXd vals = stan::math::from_matrix_cl(mat_var.val());
+  EXPECT_EQ(vals.rows(), 3);
+  EXPECT_EQ(vals.cols(), 2);
+  EXPECT_FLOAT_EQ(vals(0, 0), 1.0);
+  EXPECT_FLOAT_EQ(vals(1, 0), 2.0);
+  EXPECT_FLOAT_EQ(vals(2, 0), 3.0);
+  EXPECT_FLOAT_EQ(vals(0, 1), 4.0);
+  EXPECT_FLOAT_EQ(vals(1, 1), 5.0);
+  EXPECT_FLOAT_EQ(vals(2, 1), 6.0);
+
+  mat_var.adj() = stan::math::constant(1.0, 3, 2);
+  mat_var.adj().wait_for_write_events();
+
+  Eigen::VectorXd full_adj
+      = stan::math::from_matrix_cl<Eigen::VectorXd>(var_buf.adj());
+  ASSERT_EQ(full_adj.size(), static_cast<int>(layout.total_size));
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_FLOAT_EQ(full_adj[i], 1.0);
+  }
+}
+
+TEST(deserializer_opencl_varmat, multiple_blocks_and_padding) {
+  std::vector<int> theta_i;
+  std::vector<size_t> sizes{3, 5};
+  auto align_elems = stan::io::internal::align_elems_from_device();
+  auto layout = stan::io::compute_serializer_layout(sizes, align_elems);
+
+  Eigen::VectorXd params(static_cast<Eigen::Index>(sizes[0] + sizes[1]));
+  for (Eigen::Index i = 0; i < params.size(); ++i) {
+    params.coeffRef(i) = static_cast<double>(i + 1);
+  }
+
+  auto var_buf = stan::io::serialize_to_opencl(params, {}, sizes);
+  stan::io::deserializer<stan::math::var_value<stan::math::matrix_cl<double>>>
+      deserializer(var_buf, theta_i, align_elems);
+
+  auto vec_var = deserializer.read<stan::math::var_value<
+      stan::math::matrix_cl<double>>>(3);
+  auto row_var = deserializer.read<stan::math::var_value<
+      stan::math::matrix_cl<double>>>(1, 5);
+
+  vec_var.adj() = stan::math::constant(1.0, 3, 1);
+  row_var.adj() = stan::math::constant(2.0, 1, 5);
+  vec_var.adj().wait_for_write_events();
+  row_var.adj().wait_for_write_events();
+
+  Eigen::VectorXd full_adj
+      = stan::math::from_matrix_cl<Eigen::VectorXd>(var_buf.adj());
+  ASSERT_EQ(full_adj.size(), static_cast<int>(layout.total_size));
+
+  std::vector<double> expected(layout.total_size, 0.0);
+  for (size_t i = 0; i < sizes.size(); ++i) {
+    const size_t block_size = sizes[i];
+    const size_t offset = layout.offsets[i];
+    const double value = (i == 0) ? 1.0 : 2.0;
+    for (size_t j = 0; j < block_size; ++j) {
+      expected[offset + j] = value;
+    }
+  }
+
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_FLOAT_EQ(expected[i], full_adj[static_cast<Eigen::Index>(i)]);
+  }
+}
+#else
+#include <gtest/gtest.h>
+TEST(deserializer_opencl_varmat, dummy) { EXPECT_NO_THROW(); }
+#endif
diff --git a/src/test/unit/io/opencl_subbuffer_ops_test.cpp b/src/test/unit/io/opencl_subbuffer_ops_test.cpp
new file mode 100644
index 00000000000..1737cea4ce7
--- /dev/null
+++ b/src/test/unit/io/opencl_subbuffer_ops_test.cpp
@@ -0,0 +1,54 @@
+#ifdef STAN_OPENCL
+#include <stan/io/opencl/utils.hpp>
+#include <stan/math/opencl/prim.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <test/unit/math/expect_near_rel.hpp>
+#include <gtest/gtest.h>
+
+TEST(opencl_subbuffer_ops, add_subbuffers) {
+  std::vector<size_t> sizes{4, 4};
+  const size_t align_elems = stan::io::internal::align_elems_from_device();
+  const auto layout = stan::io::compute_serializer_layout(sizes, align_elems);
+
+  Eigen::VectorXd params(8);
+  for (Eigen::Index i = 0; i < params.size(); ++i) {
+    params.coeffRef(i) = static_cast<double>(i + 1);
+  }
+
+  auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+  stan::io::copy_to_serialize_buffer(params, values, layout);
+
+  cl::Buffer parent = values.buffer();
+  cl_buffer_region region_a{layout.offsets[0] * sizeof(double),
+                            sizes[0] * sizeof(double)};
+  cl_buffer_region region_b{layout.offsets[1] * sizeof(double),
+                            sizes[1] * sizeof(double)};
+
+  cl::Buffer sub_a = parent.createSubBuffer(CL_MEM_READ_ONLY,
+                                            CL_BUFFER_CREATE_TYPE_REGION,
+                                            &region_a);
+  cl::Buffer sub_b = parent.createSubBuffer(CL_MEM_READ_ONLY,
+                                            CL_BUFFER_CREATE_TYPE_REGION,
+                                            &region_b);
+
+  stan::math::matrix_cl<double> a(sub_a, 2, 2);
+  stan::math::matrix_cl<double> b(sub_b, 2, 2);
+
+  stan::math::matrix_cl<double> sum = a + b;
+  Eigen::MatrixXd sum_host = stan::math::from_matrix_cl<Eigen::MatrixXd>(sum);
+
+  Eigen::Map<const Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic,
+                                  Eigen::ColMajor>>
+      a_ref(params.data(), 2, 2);
+  Eigen::Map<const Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic,
+                                  Eigen::ColMajor>>
+      b_ref(params.data() + 4, 2, 2);
+
+  Eigen::MatrixXd expected = a_ref + b_ref;
+  stan::test::expect_near_rel("opencl_subbuffer_ops", sum_host, expected);
+}
+
+#else
+#include <gtest/gtest.h>
+TEST(opencl_subbuffer_ops, dummy) { EXPECT_NO_THROW(); }
+#endif
diff --git a/src/test/unit/io/serializer_opencl_layout_test.cpp b/src/test/unit/io/serializer_opencl_layout_test.cpp
new file mode 100644
index 00000000000..79a1ef236cc
--- /dev/null
+++ b/src/test/unit/io/serializer_opencl_layout_test.cpp
@@ -0,0 +1,181 @@
+#ifdef STAN_OPENCL
+#include <stan/io/opencl/utils.hpp>
+#include <stan/math/opencl/copy.hpp>
+#include <stan/math/opencl/opencl_context.hpp>
+#include <stan/math/rev/core.hpp>
+#include <gtest/gtest.h>
+#include <numeric>
+#include <vector>
+
+namespace {
+Eigen::VectorXd make_params(const std::vector<size_t>& sizes) {
+  const size_t total = std::accumulate(sizes.begin(), sizes.end(), size_t{0});
+  Eigen::VectorXd params(static_cast<Eigen::Index>(total));
+  for (Eigen::Index i = 0; i < params.size(); ++i) {
+    params.coeffRef(i) = static_cast<double>(i + 1);
+  }
+  return params;
+}
+
+void expect_layout_matches(const Eigen::VectorXd& full,
+                           const stan::io::serializer_layout& layout,
+                           const Eigen::VectorXd& params) {
+  std::vector<char> is_data(layout.total_size, 0);
+  size_t src_offset = 0;
+  for (size_t i = 0; i < layout.sizes.size(); ++i) {
+    const size_t offset = layout.offsets[i];
+    const size_t block_size = layout.sizes[i];
+    for (size_t j = 0; j < block_size; ++j) {
+      EXPECT_FLOAT_EQ(full[offset + j], params[src_offset + j]);
+      is_data[offset + j] = 1;
+    }
+    src_offset += block_size;
+  }
+  for (size_t i = 0; i < layout.total_size; ++i) {
+    if (!is_data[i]) {
+      EXPECT_FLOAT_EQ(full[static_cast<Eigen::Index>(i)], 0.0);
+    }
+  }
+}
+}  // namespace
+
+TEST(serializer_opencl_layout, compute_layout) {
+  std::vector<size_t> sizes{1, 7, 3};
+  auto layout = stan::io::compute_serializer_layout(sizes, 4);
+  ASSERT_EQ(layout.offsets.size(), sizes.size());
+  EXPECT_EQ(layout.offsets[0], 0U);
+  EXPECT_EQ(layout.offsets[1], 4U);
+  EXPECT_EQ(layout.offsets[2], 12U);
+  EXPECT_EQ(layout.total_size, 15U);
+}
+
+TEST(serializer_opencl_layout, compute_layout_no_padding) {
+  std::vector<size_t> sizes{2, 3, 5};
+  auto layout = stan::io::compute_serializer_layout(sizes, 1);
+  ASSERT_EQ(layout.offsets.size(), sizes.size());
+  EXPECT_EQ(layout.offsets[0], 0U);
+  EXPECT_EQ(layout.offsets[1], 2U);
+  EXPECT_EQ(layout.offsets[2], 5U);
+  EXPECT_EQ(layout.total_size, 10U);
+}
+
+TEST(serializer_opencl_layout, allocate_empty_buffer) {
+  std::vector<size_t> sizes;
+  auto layout = stan::io::compute_serializer_layout(sizes, 4);
+  auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+  EXPECT_EQ(values.size(), 0);
+  EXPECT_EQ(values.rows(), 0);
+  EXPECT_EQ(values.cols(), 0);
+}
+
+TEST(serializer_opencl_layout, allocate_buffer_shape) {
+  std::vector<size_t> sizes{3};
+  auto layout = stan::io::compute_serializer_layout(sizes, 4);
+  auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+  EXPECT_EQ(values.rows(), static_cast<int>(layout.total_size));
+  EXPECT_EQ(values.cols(), 1);
+}
+
+TEST(serializer_opencl_layout, copy_to_buffer) {
+  std::vector<size_t> sizes{2, 3};
+  auto layout = stan::io::compute_serializer_layout(sizes, 4);
+  stan::math::matrix_cl<double> values
+      = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+
+  if (layout.total_size > 0) {
+    auto& queue = stan::math::opencl_context.queue();
+    double zero = 0.0;
+    queue.enqueueFillBuffer(values.buffer(), zero, 0,
+                            sizeof(double) * layout.total_size);
+    queue.finish();
+  }
+
+  Eigen::VectorXd params(5);
+  params << 1.0, 2.0, 3.0, 4.0, 5.0;
+  stan::io::copy_to_serialize_buffer(params, values, layout);
+
+  Eigen::VectorXd full = stan::math::from_matrix_cl<Eigen::VectorXd>(values);
+  ASSERT_EQ(full.size(), 7);
+  EXPECT_FLOAT_EQ(full[0], 1.0);
+  EXPECT_FLOAT_EQ(full[1], 2.0);
+  EXPECT_FLOAT_EQ(full[2], 0.0);
+  EXPECT_FLOAT_EQ(full[3], 0.0);
+  EXPECT_FLOAT_EQ(full[4], 3.0);
+  EXPECT_FLOAT_EQ(full[5], 4.0);
+  EXPECT_FLOAT_EQ(full[6], 5.0);
+}
+
+TEST(serializer_opencl_layout, copy_to_buffer_many_blocks) {
+  std::vector<size_t> sizes{1, 6, 1, 4, 8};
+  auto layout = stan::io::compute_serializer_layout(sizes, 4);
+  auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+
+  if (layout.total_size > 0) {
+    auto& queue = stan::math::opencl_context.queue();
+    double zero = 0.0;
+    queue.enqueueFillBuffer(values.buffer(), zero, 0,
+                            sizeof(double) * layout.total_size);
+    queue.finish();
+  }
+
+  Eigen::VectorXd params = make_params(sizes);
+  stan::io::copy_to_serialize_buffer(params, values, layout);
+
+  Eigen::VectorXd full = stan::math::from_matrix_cl<Eigen::VectorXd>(values);
+  ASSERT_EQ(full.size(), static_cast<Eigen::Index>(layout.total_size));
+  expect_layout_matches(full, layout, params);
+}
+
+TEST(serializer_opencl_layout, serialize_to_opencl_roundtrip) {
+  std::vector<size_t> sizes{1, 2, 3, 4, 4, 12};
+  Eigen::VectorXd params = make_params(sizes);
+  std::vector<std::vector<size_t>> dimss;
+  auto params_opencl = stan::io::serialize_to_opencl(params, dimss, sizes);
+
+  auto align_elems = stan::io::internal::align_elems_from_device();
+  auto layout = stan::io::compute_serializer_layout(sizes, align_elems);
+  Eigen::VectorXd full_vals
+      = stan::math::from_matrix_cl<Eigen::VectorXd>(params_opencl.val());
+  ASSERT_EQ(full_vals.size(), static_cast<Eigen::Index>(layout.total_size));
+  expect_layout_matches(full_vals, layout, params);
+
+  EXPECT_EQ(params_opencl.adj().rows(),
+            static_cast<int>(layout.total_size));
+  EXPECT_EQ(params_opencl.adj().cols(), 1);
+  stan::math::recover_memory();
+}
+
+TEST(serializer_opencl_layout, stdvector_dimensions_match_cpu_shapes) {
+  std::vector<size_t> sizes{
+      10, 10,  // std::vector<Eigen::VectorXd>(2, 10)
+      20, 20,  // std::vector<complex vec>(2, 10) -> 2 scalars per element
+      10, 10,  // std::vector<Eigen::RowVectorXd>(2, 10)
+      20, 20,  // std::vector<complex rowvec>(2, 10)
+      16, 16,  // std::vector<Eigen::MatrixXd>(2, 4x4)
+      32, 32,  // std::vector<complex mat>(2, 4x4)
+      10, 10,  // std::vector<std::vector<double>>(2, 10)
+      20, 20   // std::vector<std::vector<complex>>(2, 10)
+  };
+
+  auto align_elems = stan::io::internal::align_elems_from_device();
+  auto layout = stan::io::compute_serializer_layout(sizes, align_elems);
+  auto values = stan::io::allocate_serializer_buffer(layout, CL_MEM_READ_ONLY);
+
+  if (layout.total_size > 0) {
+    auto& queue = stan::math::opencl_context.queue();
+    double zero = 0.0;
+    queue.enqueueFillBuffer(values.buffer(), zero, 0,
+                            sizeof(double) * layout.total_size);
+    queue.finish();
+  }
+
+  Eigen::VectorXd params = make_params(sizes);
+  stan::io::copy_to_serialize_buffer(params, values, layout);
+  Eigen::VectorXd full = stan::math::from_matrix_cl<Eigen::VectorXd>(values);
+  ASSERT_EQ(full.size(), static_cast<Eigen::Index>(layout.total_size));
+  expect_layout_matches(full, layout, params);
+}
+#else
+#include <gtest/gtest.h>
+TEST(serializer_opencl_layout, dummy) { EXPECT_NO_THROW(); }
+#endif
diff --git a/src/test/unit/model/model_base_crtp_test.cpp b/src/test/unit/model/model_base_crtp_test.cpp
index 6db95fa208c..5b2a7977cde 100644
--- a/src/test/unit/model/model_base_crtp_test.cpp
+++ b/src/test/unit/model/model_base_crtp_test.cpp
@@ -1,6 +1,9 @@
 #include <gtest/gtest.h>
 #include <stan/model/model_base.hpp>
 #include <stan/model/model_base_crtp.hpp>
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/matrix_cl.hpp>
+#endif
 #include <ostream>
 #include <stdexcept>
 #include <string>
@@ -57,6 +60,19 @@ struct mock_model : public stan::model::model_base_crtp<mock_model> {
     }
   }
 
+#ifdef STAN_OPENCL
+  stan::math::var log_prob(stan::math::matrix_cl<double>& params_r,
+                           std::ostream* msgs) const override {
+    return 9;
+  }
+
+  stan::math::var log_prob(
+      stan::math::var_value<stan::math::matrix_cl<double>>& params_r,
+      std::ostream* msgs) const override {
+    return 10;
+  }
+#endif
+
   void transform_inits(const stan::io::var_context& context,
                        Eigen::VectorXd& params_r,
                        std::ostream* msgs) const override {}
diff --git a/src/test/unit/model/model_base_test.cpp b/src/test/unit/model/model_base_test.cpp
index c4a81f71323..2dc2af11c0e 100644
--- a/src/test/unit/model/model_base_test.cpp
+++ b/src/test/unit/model/model_base_test.cpp
@@ -1,5 +1,8 @@
 #include <gtest/gtest.h>
 #include <stan/model/model_base.hpp>
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/matrix_cl.hpp>
+#endif
 #include <ostream>
 #include <stdexcept>
 #include <string>
@@ -42,6 +45,19 @@ struct mock_model : public stan::model::model_base {
     return 2;
   }
 
+#ifdef STAN_OPENCL
+  stan::math::var log_prob(stan::math::matrix_cl<double>& params_r,
+                           std::ostream* msgs) const override {
+    return 9;
+  }
+
+  stan::math::var log_prob(
+      stan::math::var_value<stan::math::matrix_cl<double>>& params_r,
+      std::ostream* msgs) const override {
+    return 10;
+  }
+#endif
+
   double log_prob_jacobian(Eigen::VectorXd& params_r,
                            std::ostream* msgs) const override {
     return 3;
diff --git a/src/test/unit/model/opencl_log_prob_test.cpp b/src/test/unit/model/opencl_log_prob_test.cpp
new file mode 100644
index 00000000000..c1c6e25e210
--- /dev/null
+++ b/src/test/unit/model/opencl_log_prob_test.cpp
@@ -0,0 +1,169 @@
+#ifdef STAN_OPENCL
+
+#include <gtest/gtest.h>
+#include <stan/io/deserializer.hpp>
+#include <stan/io/opencl/deserializer.hpp>
+#include <stan/io/opencl/utils.hpp>
+#include <stan/math.hpp>
+#include <stan/math/opencl/rev.hpp>
+#include <stan/model/model_base_crtp.hpp>
+#include <Eigen/Dense>
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace {
+
+class opencl_mock_model
+    : public stan::model::model_base_crtp<opencl_mock_model> {
+ public:
+  using var_matrix_cl_t
+      = stan::math::var_value<stan::math::matrix_cl<double>>;
+
+  opencl_mock_model(size_t a_size, size_t b_size)
+      : model_base_crtp(a_size + b_size),
+        a_size_(a_size),
+        b_size_(b_size) {}
+
+  std::string model_name() const override { return "opencl_mock_model"; }
+
+  std::vector<std::string> model_compile_info() const override { return {}; }
+
+  void get_param_names(std::vector<std::string>& names, bool include_tparams,
+                       bool include_gqs) const override {
+    names.clear();
+    names.emplace_back("a");
+    names.emplace_back("b");
+  }
+
+  void get_dims(std::vector<std::vector<size_t>>& dimss, bool include_tparams,
+                bool include_gqs) const override {
+    dimss.clear();
+    dimss.emplace_back(std::vector<size_t>{a_size_});
+    dimss.emplace_back(std::vector<size_t>{b_size_});
+  }
+
+  void constrained_param_names(std::vector<std::string>& param_names,
+                               bool include_tparams,
+                               bool include_gqs) const override {
+    param_names.clear();
+  }
+
+  void unconstrained_param_names(std::vector<std::string>& param_names,
+                                 bool include_tparams,
+                                 bool include_gqs) const override {
+    param_names.clear();
+  }
+
+  template <bool propto, bool jacobian, typename T>
+  T log_prob(Eigen::Matrix<T, -1, 1>& params_r, std::ostream* msgs) const {
+    std::vector<int> params_i;
+    stan::io::deserializer<T> in(params_r, params_i);
+    using vec_t = Eigen::Matrix<T, Eigen::Dynamic, 1>;
+    vec_t a = in.template read<vec_t>(static_cast<Eigen::Index>(a_size_));
+    vec_t b = in.template read<vec_t>(static_cast<Eigen::Index>(b_size_));
+    return stan::math::dot_product(a, a) + stan::math::dot_product(b, b);
+  }
+
+  template <bool propto, bool jacobian, typename T>
+  T log_prob(std::vector<T>& params_r, std::vector<int>& params_i,
+             std::ostream* msgs) const {
+    return 0;
+  }
+
+  stan::math::var log_prob(stan::math::matrix_cl<double>& params_r,
+                           std::ostream* msgs) const override {
+    std::vector<int> params_i;
+    size_t align_elems = stan::io::internal::align_elems_from_device();
+    stan::io::deserializer<stan::math::matrix_cl<double>> in(
+        params_r, params_i, align_elems);
+    auto a = in.template read<stan::math::matrix_cl<double>>(
+        static_cast<Eigen::Index>(a_size_));
+    auto b = in.template read<stan::math::matrix_cl<double>>(
+        static_cast<Eigen::Index>(b_size_));
+    double lp = stan::math::dot_product(a, a) + stan::math::dot_product(b, b);
+    return stan::math::var(lp);
+  }
+
+  stan::math::var log_prob(
+      stan::math::var_value<stan::math::matrix_cl<double>>& params_r,
+      std::ostream* msgs) const override {
+    std::vector<int> params_i;
+    size_t align_elems = stan::io::internal::align_elems_from_device();
+    stan::io::deserializer<var_matrix_cl_t> in(params_r, params_i, align_elems);
+    auto a = in.template read<var_matrix_cl_t>(
+        static_cast<Eigen::Index>(a_size_));
+    auto b = in.template read<var_matrix_cl_t>(
+        static_cast<Eigen::Index>(b_size_));
+    return stan::math::dot_product(a, a) + stan::math::dot_product(b, b);
+  }
+
+  void transform_inits(const stan::io::var_context& context,
+                       Eigen::VectorXd& params_r,
+                       std::ostream* msgs) const override {}
+
+  template <typename RNG>
+  void write_array(RNG& base_rng, Eigen::VectorXd& params_r,
+                   Eigen::VectorXd& params_constrained_r, bool include_tparams,
+                   bool include_gqs, std::ostream* msgs) const {}
+
+  void unconstrain_array(const Eigen::VectorXd& params_constrained_r,
+                         Eigen::VectorXd& params_r,
+                         std::ostream* msgs = nullptr) const override {}
+
+  void transform_inits(const stan::io::var_context& context,
+                       std::vector<int>& params_i,
+                       std::vector<double>& params_r,
+                       std::ostream* msgs) const override {}
+
+  template <typename RNG>
+  void write_array(RNG& base_rng, std::vector<double>& params_r,
+                   std::vector<int>& params_i,
+                   std::vector<double>& params_r_constrained,
+                   bool include_tparams, bool include_gqs,
+                   std::ostream* msgs) const {}
+
+  void unconstrain_array(const std::vector<double>& params_constrained_r,
+                         std::vector<double>& params_r,
+                         std::ostream* msgs = nullptr) const override {}
+
+ private:
+  size_t a_size_;
+  size_t b_size_;
+};
+
+}  // namespace
+
+TEST(model, openclLogProbMatchesCpu) {
+  size_t align_elems = stan::io::internal::align_elems_from_device();
+  size_t a_size = align_elems > 1 ? align_elems - 1 : 2;
+  size_t b_size = 3;
+  opencl_mock_model model(a_size, b_size);
+
+  Eigen::VectorXd params(static_cast<Eigen::Index>(a_size + b_size));
+  for (Eigen::Index i = 0; i < params.size(); ++i) {
+    params.coeffRef(i) = static_cast<double>(i + 1);
+  }
+
+  double expected = params.squaredNorm();
+
+  std::vector<std::vector<size_t>> dimss;
+  std::vector<size_t> sizes{a_size, b_size};
+  auto params_opencl = stan::io::serialize_to_opencl(params, dimss, sizes);
+
+  auto lp_opencl = model.log_prob(params_opencl, nullptr);
+  EXPECT_NEAR(expected, lp_opencl.val(), 1e-12);
+
+  stan::math::matrix_cl<double> params_vals = params_opencl.val();
+  auto lp_opencl_prim = model.log_prob(params_vals, nullptr);
+  EXPECT_NEAR(expected, lp_opencl_prim.val(), 1e-12);
+
+  stan::math::recover_memory();
+}
+
+#else
+
+#include <gtest/gtest.h>
+TEST(model, openclLogProbDummy) { EXPECT_NO_THROW(); }
+
+#endif
diff --git a/src/test/unit/services/util/mcmc_writer_test.cpp b/src/test/unit/services/util/mcmc_writer_test.cpp
index c9840a68806..92afc6bae81 100644
--- a/src/test/unit/services/util/mcmc_writer_test.cpp
+++ b/src/test/unit/services/util/mcmc_writer_test.cpp
@@ -7,6 +7,9 @@
 #include <stan/callbacks/writer.hpp>
 #include <stan/io/empty_var_context.hpp>
 #include <stan/services/util/create_rng.hpp>
+#ifdef STAN_OPENCL
+#include <stan/math/opencl/matrix_cl.hpp>
+#endif
 
 namespace test {
 // mock_throwing_model_in_write_array throws exception in the write_array()
@@ -39,6 +42,20 @@ class throwing_model : public stan::model::model_base_crtp<throwing_model> {
     return 0.0;
   }  // log_prob()
 
+#ifdef STAN_OPENCL
+  inline stan::math::var log_prob(stan::math::matrix_cl<double>& params_r,
+                                  std::ostream* pstream__ = nullptr) const
+      override {
+    return 0.0;
+  }
+
+  inline stan::math::var log_prob(
+      stan::math::var_value<stan::math::matrix_cl<double>>& params_r,
+      std::ostream* pstream__ = nullptr) const override {
+    return 0.0;
+  }
+#endif
+
   template <typename RNG>
   void write_array(RNG& base_rng__, std::vector<double>& params_r__,
                    std::vector<int>& params_i__, std::vector<double>& vars__,