Open ianhbell opened 3 years ago
This is not so much faster:
template<int Norder, typename VecType>
double y_Clenshaw_x2_paired(const double x, const VecType& c, const double xmin, const double xmax) {
auto xscaled2 = (2 * x - (xmax + xmin)) / (xmax - xmin) * 2.0; // x*2
double u_kp1 = 0, u_kp2 = 0;
for (int k = Norder; k >= 2; k -= 2) {
// Do the recurrent calculation
u_kp2 = c[k] + xscaled2 * u_kp1 - u_kp2;
u_kp1 = c[k-1] + xscaled2 * u_kp2 - u_kp1;
}
if (Norder % 2) {
auto tmp = u_kp1;
u_kp1 = c[1] + xscaled2 * u_kp1 - u_kp2;
u_kp2 = tmp;
}
return c[0] + 0.5*xscaled2*u_kp1 - u_kp2;
}
I have used this code and I recall getting a substantial increase in speed for fixed size polynomials:
template<typename T>
T T0(const T& x) {
return 1.0;
};
template<> inline
Eigen::ArrayXd T0(const Eigen::ArrayXd& x) {
Eigen::ArrayXd res;
res.resizeLike(x);
res.setConstant(1.0);
return res;
};
template<typename T>
T T1(const T& x) {
return x;
};
template<typename T>
T T2(const T& x) {
return (2.0 * x*x) - 1.0;
};
template<typename T>
T Tn(std::size_t n, const T& x) {
if (n == 0) return T0(x);
else if (n == 1) return T1(x);
else if (n == 2) return T2(x);
//Potentially faster than the recursive version
//return (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);
T tnm1 = T2(x);
T tnm2 = T1(x);
T tn = tnm1;
for (std::size_t l = 3; l < n + 1; l++) {
tn = (2.0 * x * tnm1) - tnm2;
tnm2 = tnm1;
tnm1 = tn;
}
return tn;
};
template<typename T>
void Tn_series(int n, const T& x, Eigen::Array<T, Eigen::Dynamic, 1>& res) {
res.resize(0);
if (n < 0) return;
res.resize(n + 1);
res(0, 0) = T0(x);
if (n == 0) return;
res(1, 0) = T1(x);
if (n == 1) return;
res(2, 0) = T2(x);
if (n == 2) return;
//Potentially faster than the recursive version
//return (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);
T tnm1 = res(2, 0);
T tnm2 = res(1, 0);
T tn = tnm1;
for (int i = 3; i < n + 1; i++) {
tn = (2.0 * x * tnm1) - tnm2;
tnm2 = tnm1;
tnm1 = tn;
res(i, 0) = tn;
}
return;
};
template<typename T>
void Tn_series(int n, const Eigen::Array<T, Eigen::Dynamic, 1>& x, Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>& res) {
res.resize(0, 0);
if (n < 0) return;
try {
res.resize(x.rows(), n + 1);
} catch (...) {
res.resize(0, 0);
return;
}
res.col(0) = T0(x);
if (n == 0) return;
res.col(1) = T1(x);
if (n == 1) return;
res.col(2) = T2(x);
if (n == 2) return;
//Potentially faster than the recursive version
//res.col(i) = (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);
Eigen::Array<T, Eigen::Dynamic, 1> tnm1 = res.col(2);
Eigen::Array<T, Eigen::Dynamic, 1> tnm2 = res.col(1);
Eigen::Array<T, Eigen::Dynamic, 1> tn = tnm1;
for (int i = 3; i < n + 1; i++) {
tn = (2.0 * x * tnm1) - tnm2;
tnm2 = tnm1;
tnm1 = tn;
res.col(i) = tn;
}
return;
};
The actual calculation was embedded in a class, but you get the idea from this:
virtual const ErrCodes _evaluate_matrix(const double& x_, double& y) const {
if (_coeffs.rows() < 1) { y = NAN; return error; }
if (_coeffs.cols() != 1) { y = NAN; return error; }
//if (x_ < -1) { y = NAN; return error; }
//if (x_ > 1) { y = NAN; return error; }
const double x = (x_ - _x_start - _x_halfwidth) / _x_halfwidth;
const int x_degree = _coeffs.rows() - 1;
ObservationType res;
Tn_series(x_degree, x, res);
y = (res*_coeffs).sum();
return noError;
}
const ErrCodes _evaluate_matrix(const ObservationType& x_, ObservationType& y) const {
if (_coeffs.rows() < 1) { y.setConstant(NAN); return error; }
if (_coeffs.cols() != 1) { y.setConstant(NAN); return error; }
//if ((x < -1).any()) { y.setConstant(NAN); return error; }
//if ((x > 1).any()) { y.setConstant(NAN); return error; }
try {
y.resizeLike(x_);
y.setConstant(NAN);
} catch (...) {
y.resize(0, 1);
return error;
}
const ObservationType x = (x_ - _x_start - _x_halfwidth) / _x_halfwidth;
const int x_degree = _coeffs.rows() - 1;
CoefficientType x_res;
Tn_series(x_degree, x, x_res);
//y = x_res.matrix() * _coeffs.matrix();
for (int i = 0; i < x_res.rows(); i++) {
y(i) = (x_res.row(i) * _coeffs.transpose()).sum();
}
return noError;
}
I think the point here is that the bonus you get is a consequence of the fact that you are leveraging parallelism at the level of matrix x vector products, a place where Eigen has been extensively optimized. If you were to do a single input, I expect you would see a penalty due to the fact that you have the additional array overhead.
See chebfun's implementation: https://github.com/chebfun/chebfun/blob/master/%40chebtech/clenshaw.m