Closed ericyan71 closed 5 years ago
Multiplying by 0.5f
is faster than dividing by 2.
You can also avoid temporaries in the last assigns:
auto res0 = xt::view(result, xt::all(), 0);
auto res1 = xt::view(result, xt::all(), 1);
auto res2 = xt::view(result, xt::all(), 2);
auto res3 = xt::view(result, xt::all(), 3);
xt::noalias(res0) = ty - th / 2;
xt::noalias(res1) = tx - tw / 2;
xt::noalias(res2) = ty + th / 2;
xt::noalias(res3) = tx + tw / 2;
If you have the Eigen code, we could use it to benchmark xtensor vs Eigen and make sure that we have comparable speeds.
Also assigning the load_npy to a xtensor can help:
#include <xtensor/xtensor.hpp>
xt::xtensor<float, 2> loc = xt::load_npy<float>("c:/temp/loc.npy");
xt::xtensor<float, 2> anchor = xt::load_npy<float>("c:/temp/anchor.npy");
xt::xtensor<float, 2> fg_score = xt::load_npy<float>("c:/temp/fg_score.npy");
thanks, I followed the above suggestion, and the time consumed dropped to 3.8ms. I'm not sure xsimd is used properly: I only added the xsimd include path to my project, and then set the compile flag XTENSOR_USE_XSIMD.
here is my Eigen code, the test data is the same as it used in xtensor:
#include <iostream>
#include <omp.h>
#include "Eigen/Core"
#include "xtensor/xio.hpp"
#include "xtensor/xnpy.hpp"
// Load the benchmark inputs from .npy files and copy them into the two
// output Eigen matrices, which are resized/zeroed to match the data.
// The copy is done element-wise on purpose: xtensor's load_npy result is
// row-major while Eigen::MatrixXf is column-major, so a raw buffer copy
// would transpose the data.
void load_data(Eigen::MatrixXf& mat_loc, Eigen::MatrixXf& mat_anchor)
{
    auto loc = xt::load_npy<float>("c:/temp/loc.npy");
    auto anchor = xt::load_npy<float>("c:/temp/anchor.npy");
    // fg_score was loaded here in the original but never used -- removed.
    const size_t d0 = loc.shape()[0];
    const size_t d1 = loc.shape()[1];
    mat_loc.setZero(d0, d1);
    mat_anchor.setZero(d0, d1);
    // size_t loop indices avoid the signed/unsigned comparison the original
    // had (int i < size_t d0), which both warns and narrows.
    for (size_t i = 0; i < d0; i++)
    {
        for (size_t j = 0; j < d1; j++)
        {
            mat_loc(i, j) = loc(i, j);
            mat_anchor(i, j) = anchor(i, j);
        }
    }
}
// Eigen version of the box-decoding benchmark: converts anchor boxes plus
// regression offsets (loc) into corner coordinates, averaging the wall time
// over a fixed number of iterations.
int main()
{
    Eigen::MatrixXf loc;
    Eigen::MatrixXf anchor;
    load_data(loc, anchor);
    Eigen::MatrixXf result(loc.rows(), loc.cols());
    result.setZero();
    constexpr int kLoops = 40;  // was a bare literal repeated twice below
    // BUG FIX: omp_get_wtime() returns double; the original stored it in a
    // float, discarding precision on a millisecond-scale measurement.
    double ts = omp_get_wtime();
    for (int i = 0; i < kLoops; i++)
    {
        // NOTE: these 'auto' locals are Eigen expression templates, not
        // evaluated matrices -- each use below re-evaluates them lazily.
        auto top = anchor.col(0);
        auto left = anchor.col(1);
        auto h = anchor.col(2) - anchor.col(0);
        auto w = anchor.col(3) - anchor.col(1);
        // * 0.5f instead of / 2: the compiler cannot turn a vectorized
        // division into a multiply on its own, and the 'f' suffix avoids
        // a float -> double promotion in the mixed expression.
        auto cy = top + h * 0.5f;
        auto cx = left + w * 0.5f;
        auto ty = h.cwiseProduct(loc.col(0)) + cy;
        auto tx = w.cwiseProduct(loc.col(1)) + cx;
        auto th = h.cwiseProduct(loc.col(2).array().exp().matrix());
        auto tw = w.cwiseProduct(loc.col(3).array().exp().matrix());
        result.col(0) = ty - th * 0.5f;
        result.col(1) = tx - tw * 0.5f;
        result.col(2) = ty + th * 0.5f;
        result.col(3) = tx + tw * 0.5f;
    }
    ts = omp_get_wtime() - ts;
    std::cout << ts / kLoops << std::endl;
    return 0;
}
Also I think Eigen default layout is column major, could you try replacing the first lines of your test function with the following ones?
using tensor_type = xt::xtensor<float, 2, xt::layout_type::column_major>;
tensor_type loc = xt::load_npy<float>("c:/temp/loc.npy");
tensor_type anchor = xt::load_npy<float>("c:/temp/anchor.npy");
tensor_type fg_score = xt::load_npy<float>("c:/temp/fg_score.npy");
xt::xarray<float, xt::layout_type::column_major> result(loc.shape(), 0);
Don't hesitate to share the latest xtensor version.
Sorry I came back late, and thank you very much for answering my questions with great patience. I tried again, and I found that when the variable 'result' is also changed to tensor_type, using column_major, the code reaches a maximum speed of 0.9ms — that's great. If I use row_major, the speed is still 3.8ms.
I don't understand why column_major could be faster than row_major.
here is new code, thanks.
#include <iostream>
#include "xtensor/xarray.hpp"
#include "xtensor/xio.hpp"
#include "xtensor/xview.hpp"
#include "xtensor/xnpy.hpp"
#include "xtensor/xnoalias.hpp"
#include <omp.h>
#include <tuple>
#include <array>
#include "xtensor/xtensor.hpp"
using namespace xt::placeholders;
// xtensor version of the box-decoding benchmark.  Column-major tensors are
// used deliberately: every operation below works on whole columns, which are
// memory-contiguous in column-major layout and therefore SIMD-friendly
// (this is why column_major measured much faster than row_major here).
void test()
{
    using tensor_type = xt::xtensor<float, 2, xt::layout_type::column_major>;
    tensor_type loc = xt::load_npy<float>("c:/temp/loc.npy");
    tensor_type anchor = xt::load_npy<float>("c:/temp/anchor.npy");
    // fg_score was loaded here in the original but never used -- removed.
    tensor_type result(loc.shape(), 0);
    const int loop = 500;
    // BUG FIX: omp_get_wtime() returns double; the original narrowed it to
    // float, losing precision on a millisecond-scale measurement.
    double ts = omp_get_wtime();
    for (int i = 0; i < loop; i++)
    {
        // Lazy xtensor expressions: nothing is evaluated until the
        // noalias assignments at the bottom.
        auto anc0 = xt::view(anchor, xt::all(), 0);
        auto anc1 = xt::view(anchor, xt::all(), 1);
        auto h = xt::view(anchor, xt::all(), 2) - anc0;
        auto w = xt::view(anchor, xt::all(), 3) - anc1;
        // * 0.5f instead of / 2, as concluded later in this thread: the
        // array division by a runtime scalar is not folded into a multiply
        // by the compiler, and 'f' prevents mixed float/double arithmetic.
        auto cy = anc0 + h * 0.5f;
        auto cx = anc1 + w * 0.5f;
        auto ty = h * xt::view(loc, xt::all(), 0) + cy;
        auto tx = w * xt::view(loc, xt::all(), 1) + cx;
        auto th = xt::exp(xt::view(loc, xt::all(), 2)) * h;
        auto tw = xt::exp(xt::view(loc, xt::all(), 3)) * w;
        auto res0 = xt::view(result, xt::all(), 0);
        auto res1 = xt::view(result, xt::all(), 1);
        auto res2 = xt::view(result, xt::all(), 2);
        auto res3 = xt::view(result, xt::all(), 3);
        // noalias skips the temporary xtensor would otherwise allocate to
        // guard against aliasing between result and the right-hand sides.
        xt::noalias(res0) = ty - th * 0.5f;
        xt::noalias(res1) = tx - tw * 0.5f;
        xt::noalias(res2) = ty + th * 0.5f;
        xt::noalias(res3) = tx + tw * 0.5f;
    }
    ts = omp_get_wtime() - ts;
    std::cout << ts / loop << std::endl;
    std::cout << result << std::endl;
}
// Entry point: run the xtensor benchmark once.
int main()
{
    test();
    // returning 0 is implicit for main in C++
}
You should be able to speed that up by replacing / 2
with * 0.5f
.
The reason is that on scalar operations, the compiler would do the same thing in both cases, but on arrays, it cannot optimize the division by a known quantity into a multiplication. Specifying f
for the float prevents mixed arithmetic.
The reason for column-major to be faster is that you are making operations on columns, which are memory-contiguous in column-major arrays, and allow for SIMD acceleration to kick in.
If you were manipulating rows, the row-major version would be faster.
OK, thanks! I understand! And replacing /2
with *0.5f
speeds it up even more, and the f
should not be omitted.
thanks again!
You're welcome! Could you post the results of your final benchmarks?
I am curious about your final benchmark!
ok, I add some code to show the result more clearly:
ts = omp_get_wtime() - ts;
std::cout << "time: " << ts / loop << "/s" << std::endl;
std::cout << "result: " << std::endl;
std::cout << result << std::endl;
and the result shows below:
Hello, I'm new to xtensor, and I really like it. Yesterday I ran a test and found it somewhat slow. Below is the time consumed by the same code written in numpy, Eigen and xtensor: numpy 1.5ms, Eigen 0.7ms, xtensor 5.5ms.
Here is my code and test data; I want to know if I did something wrong. Thanks.
data0235425.zip