usnistgov / ChebTools

C++ tools for working with Chebyshev expansion interpolants
MIT License

Add better Clenshaw #29

Open ianhbell opened 3 years ago

ianhbell commented 3 years ago

See chebfun's implementation: https://github.com/chebfun/chebfun/blob/master/%40chebtech/clenshaw.m
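
For reference, the plain one-coefficient-at-a-time Clenshaw recurrence that the chebfun paired form improves upon might look like the following sketch (the function name and the [xmin, xmax] scaling convention simply mirror the snippet in the next comment; this is not ChebTools code):

template<int Norder, typename VecType>
double y_Clenshaw_basic(const double x, const VecType& c, const double xmin, const double xmax) {
    // Map x from [xmin, xmax] into [-1, 1]
    const double xscaled = (2 * x - (xmax + xmin)) / (xmax - xmin);
    double u_k = 0, u_kp1 = 0, u_kp2 = 0;
    // Standard Clenshaw: u_k = c[k] + 2*xscaled*u_{k+1} - u_{k+2}, one coefficient per step
    for (int k = Norder; k >= 1; --k) {
        u_k = c[k] + 2.0 * xscaled * u_kp1 - u_kp2;
        u_kp2 = u_kp1;
        u_kp1 = u_k;
    }
    return c[0] + xscaled * u_kp1 - u_kp2;
}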

ianhbell commented 3 years ago

This paired version is not much faster:

template<int Norder, typename VecType>
double y_Clenshaw_x2_paired(const double x, const VecType& c, const double xmin, const double xmax) {
    // Scale x from [xmin, xmax] into [-1, 1], then double it (the recurrence uses 2*xscaled throughout)
    auto xscaled2 = (2 * x - (xmax + xmin)) / (xmax - xmin) * 2.0;
    double u_kp1 = 0, u_kp2 = 0;
    // Unrolled Clenshaw recurrence: fold in two coefficients per iteration
    for (int k = Norder; k >= 2; k -= 2) {
        u_kp2 = c[k] + xscaled2 * u_kp1 - u_kp2;
        u_kp1 = c[k-1] + xscaled2 * u_kp2 - u_kp1;
    }
    // For odd order, one extra step is needed to fold in c[1]
    if (Norder % 2) {
        auto tmp = u_kp1;
        u_kp1 = c[1] + xscaled2 * u_kp1 - u_kp2;
        u_kp2 = tmp;
    }
    return c[0] + 0.5*xscaled2*u_kp1 - u_kp2;
}
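
As a rough way to compare the two evaluators at a fixed order, something along these lines could be used (the coefficients, degree, and timing harness here are made up for illustration):

#include <array>
#include <chrono>
#include <cstdio>

int main() {
    // Hypothetical degree-8 coefficients, just to exercise the evaluator
    std::array<double, 9> c = {1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625};
    const double xmin = 0.0, xmax = 10.0;
    double chk = 0;
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < 1000000; ++i) {
        chk += y_Clenshaw_x2_paired<8>(xmin + 1e-5 * i, c, xmin, xmax);
    }
    auto t1 = std::chrono::steady_clock::now();
    // Print elapsed time plus a checksum so the loop is not optimized away
    std::printf("paired: %g s (checksum %g)\n", std::chrono::duration<double>(t1 - t0).count(), chk);
}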
jowr commented 3 years ago

I have used this code and I recall getting a substantial speed increase for fixed-size polynomials:

template<typename T>
T T0(const T& x) {
    return 1.0;
};

template<> inline
Eigen::ArrayXd T0(const Eigen::ArrayXd& x) {
    Eigen::ArrayXd res;
    res.resizeLike(x);
    res.setConstant(1.0);
    return res;
};

template<typename T>
T T1(const T& x) {
    return x;
};

template<typename T>
T T2(const T& x) {
    return (2.0 * x*x) - 1.0;
};

template<typename T>
T Tn(std::size_t n, const T& x) {
    if (n == 0) return T0(x);
    else if (n == 1) return T1(x);
    else if (n == 2) return T2(x);

    //Potentially faster than the recursive version
    //return (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);

    T tnm1 = T2(x);
    T tnm2 = T1(x);
    T tn = tnm1;

    for (std::size_t l = 3; l < n + 1; l++) {
        tn = (2.0 * x * tnm1) - tnm2;
        tnm2 = tnm1;
        tnm1 = tn;
    }

    return tn;
};

template<typename T>
void Tn_series(int n, const T& x, Eigen::Array<T, Eigen::Dynamic, 1>& res) {
    res.resize(0);
    if (n < 0) return;
    res.resize(n + 1);

    res(0, 0) = T0(x);
    if (n == 0) return;

    res(1, 0) = T1(x);
    if (n == 1) return;

    res(2, 0) = T2(x);
    if (n == 2) return;

    //Potentially faster than the recursive version
    //return (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);

    T tnm1 = res(2, 0);
    T tnm2 = res(1, 0);
    T tn = tnm1;

    for (int i = 3; i < n + 1; i++) {
        tn = (2.0 * x * tnm1) - tnm2;
        tnm2 = tnm1;
        tnm1 = tn;
        res(i, 0) = tn;
    }

    return;
};

template<typename T>
void Tn_series(int n, const Eigen::Array<T, Eigen::Dynamic, 1>& x, Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>& res) {
    res.resize(0, 0);
    if (n < 0) return;
    try {
        res.resize(x.rows(), n + 1);
    } catch (...) {
        res.resize(0, 0);
        return;
    }

    res.col(0) = T0(x);
    if (n == 0) return;

    res.col(1) = T1(x);
    if (n == 1) return;

    res.col(2) = T2(x);
    if (n == 2) return;

    //Potentially faster than the recursive version
    //res.col(i) = (2.0 * x * Tn(n - 1, x)) - Tn(n - 2, x);

    Eigen::Array<T, Eigen::Dynamic, 1> tnm1 = res.col(2);
    Eigen::Array<T, Eigen::Dynamic, 1> tnm2 = res.col(1);
    Eigen::Array<T, Eigen::Dynamic, 1> tn = tnm1;

    for (int i = 3; i < n + 1; i++) {
        tn = (2.0 * x * tnm1) - tnm2;
        tnm2 = tnm1;
        tnm1 = tn;
        res.col(i) = tn;
    }
    return;
};
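
As a self-contained illustration of how these helpers can be driven, a minimal sketch (the coefficient values and input points below are made up, not taken from the original code):

#include <Eigen/Dense>
#include <iostream>

int main() {
    // Made-up degree-4 coefficients of an expansion on [-1, 1]
    Eigen::ArrayXd coeffs(5);
    coeffs << 1.0, 0.5, 0.25, 0.125, 0.0625;

    // Scalar input: build T_0(x)..T_4(x), then dot with the coefficients
    Eigen::Array<double, Eigen::Dynamic, 1> Ts;
    Tn_series(4, 0.3, Ts);
    std::cout << (Ts * coeffs).sum() << std::endl;

    // Array input: one row of Chebyshev values per input point, then a matrix-vector product
    Eigen::ArrayXd x(3);
    x << -0.5, 0.0, 0.5;
    Eigen::Array<double, Eigen::Dynamic, Eigen::Dynamic> Tmat;
    Tn_series(4, x, Tmat);
    Eigen::ArrayXd y = (Tmat.matrix() * coeffs.matrix()).array();
    std::cout << y.transpose() << std::endl;
}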

The actual calculation was embedded in a class, but you get the idea from this:

virtual const ErrCodes _evaluate_matrix(const double& x_, double& y) const {
    if (_coeffs.rows() <  1) { y = NAN; return error; }
    if (_coeffs.cols() != 1) { y = NAN; return error; }
    //if (x_ < -1) { y = NAN; return error; }
    //if (x_ >  1) { y = NAN; return error; }

    const double x = (x_ - _x_start - _x_halfwidth) / _x_halfwidth;
    const int x_degree = _coeffs.rows() - 1;

    ObservationType res;
    Tn_series(x_degree, x, res);
    y = (res*_coeffs).sum();
    return noError;
}

const ErrCodes _evaluate_matrix(const ObservationType& x_, ObservationType& y) const {
    if (_coeffs.rows() <  1) { y.setConstant(NAN); return error; }
    if (_coeffs.cols() != 1) { y.setConstant(NAN); return error; }
    //if ((x < -1).any()) { y.setConstant(NAN); return error; }
    //if ((x > 1).any()) { y.setConstant(NAN); return error; }

    try {
        y.resizeLike(x_);
        y.setConstant(NAN);
    } catch (...) {
        y.resize(0, 1);
        return error;
    }

    const ObservationType x = (x_ - _x_start - _x_halfwidth) / _x_halfwidth;
    const int x_degree = _coeffs.rows() - 1;

    CoefficientType x_res;
    Tn_series(x_degree, x, x_res);

    //y = x_res.matrix() * _coeffs.matrix();
    for (int i = 0; i < x_res.rows(); i++) {
        y(i) = (x_res.row(i) * _coeffs.transpose()).sum();
    }
    return noError;
}
ianhbell commented 3 years ago

I think the point here is that the speedup you get comes from leveraging parallelism at the level of matrix-vector products, where Eigen has been extensively optimized. For a single input, I would expect a penalty from the additional array overhead.
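
For what it's worth, that overhead question could also be probed by vectorizing the Clenshaw recurrence itself over an Eigen array, without ever forming the matrix of T_n values. A minimal sketch (not ChebTools API; the function name is illustrative):

#include <Eigen/Dense>

Eigen::ArrayXd clenshaw_many(const Eigen::ArrayXd& x, const Eigen::ArrayXd& c,
                             const double xmin, const double xmax) {
    // Map all inputs into [-1, 1] in one vectorized pass
    Eigen::ArrayXd xscaled = (2 * x - (xmax + xmin)) / (xmax - xmin);
    Eigen::ArrayXd u_kp1 = Eigen::ArrayXd::Zero(x.size()),
                   u_kp2 = Eigen::ArrayXd::Zero(x.size()),
                   u_k(x.size());
    const int Norder = static_cast<int>(c.size()) - 1;
    // Same recurrence as the scalar version, applied elementwise to all inputs at once
    for (int k = Norder; k >= 1; --k) {
        u_k = c(k) + 2.0 * xscaled * u_kp1 - u_kp2;
        u_kp2 = u_kp1;
        u_kp1 = u_k;
    }
    return c(0) + xscaled * u_kp1 - u_kp2;
}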