Better C++ codegen when passing multiple arguments as tuple

acl33 commented 5 years ago

blas_slow.ks contains code implementing BLAS routine "gemm" and a small benchmark. Execution time (after building with ksc and g++ 7.4) between 4.8-5.1s consistent over at least 20 runs on Linux VM. ksc commit a45667afe1d56be1f93de00631dd1a6f005fc1f7, haskell 8.4.4, cabal 2.4.1.0, Ubuntu 18.04.

blas_fast.ks contains the same code, but passing the 5 arguments in a tuple. Execution time between 3.8-4.1s on same VM.

blast_test2.cpp contains the c++ code generated from blas_fast.ks, but modified by hand to pass the 5 arguments separately as 5 args rather than 1 tuple. It also runs in around 4s, so argument passing is not the difference, but rather, the c++ code generated by ksc in response to the tuple, is (somehow) better....

blas_slow.ks.txt

blas_fast.ks.txt

blas_test2.cpp.txt

awf commented 5 years ago

Thanks, good issue. Could you paste the [minimized by renaming where possible] diffs between the .ks files, and the diffs between the blas_test2.cpp and blas_fast.cpp?

toelli-msft commented 5 years ago

That's very interesting. We should look at the .kso output for each case to see what substantial difference there is there.

toelli-msft commented 5 years ago

The gemm functions in the .kso files are not different from the original .ks file, as far as I can see. Furthermore I can't see any difference in the C++, besides the tupling and untupling. Does anyone know of a tool that will compare C++ code up to renamings?

acl33 commented 5 years ago

The .ks's differ only: (- for slow and + for fast):

 (def
  gemm (Vec n (Vec l Float))
- ((alpha : Float)
-  (mat_a : Vec n (Vec m Float))
-  (mat_b : Vec m (Vec l Float))
-  (beta : Float)
-  (mat_c : Vec n (Vec l Float)))
+ ((var0 : (Tuple Float (Vec n (Vec m Float)) (Vec m (Vec l Float)) Float (Vec n (Vec l Float)))))
  (let
-  ((mat_x (build n
+  ((alpha (get$1$5 var0))
+   (mat_a (get$2$5 var0))
+   (mat_b (get$3$5 var0))
+   (beta (get$4$5 var0))
+   (mat_c (get$5$5 var0))
+   (mat_x (build n

acl33 commented 5 years ago

To convert blas_fast.cpp to blas_test2.cpp (that passes multiple arguments like blas_slow, but is fast like blas_fast), I did this:

$ diff -u blas_fast.cpp blas_test2.cpp
--- blas_fast.cpp       2019-09-20 22:42:33.388238100 +0100
+++ blas_test2.cpp      2019-09-25 16:14:19.031634300 +0100
@@ -2,54 +2,54 @@
 namespace ks {

 typedef vec<vec<double>> ty$gemm;
-ty$gemm gemm(tuple<double,vec<vec<double>>,vec<vec<double>>,double,vec<vec<double>>> var0) {
+ty$gemm gemm(double alpha, vec<vec<double>> mat_a, vec<vec<double>> mat_b, double beta, vec<vec<double>> mat_c) {
 /*tup*//*std::get<0>(var0)*/
 /*tup*//*fromList [Simple "var0"]*/
-int n = size(std::get<1>(var0));
+int n = size(mat_a);
 /*fromList [Simple "n",Simple "var0"]*/
-int m = size((std::get<1>(var0))[0]);
+int m = size((mat_a)[0]);
 /*((std::get<1>(var0))[0])[0]*/
-/*tup*/KS_ASSERT(m == size(std::get<2>(var0)));
+/*tup*/KS_ASSERT(m == size(mat_b));
 /*fromList [Simple "m",Simple "n",Simple "var0"]*/
-int l = size((std::get<2>(var0))[0]);
+int l = size((mat_b)[0]);
 /*((std::get<2>(var0))[0])[0]*/
 /*tup*//*std::get<3>(var0)*/
-/*tup*/KS_ASSERT(n == size(std::get<4>(var0)));
-KS_ASSERT(l == size((std::get<4>(var0))[0]));
+/*tup*/KS_ASSERT(n == size(mat_c));
+KS_ASSERT(l == size((mat_c)[0]));
 /*((std::get<4>(var0))[0])[0]*/

 /**Let**/vec<vec<double>> c$67;
 {

 $MRK(c$1);
-double c$0 = ks_get<0>(var0);
+double c$0 = alpha;
 $REL(c$1);

 double alpha = c$0;
 /**Let**/vec<vec<double>> c$66;
 {

-vec<vec<double>> c$2 = ks_get<1>(var0);
+vec<vec<double>> c$2 = mat_a;

 vec<vec<double>> mat_a = c$2;
 /**Let**/vec<vec<double>> c$65;
 {

-vec<vec<double>> c$4 = ks_get<2>(var0);
+vec<vec<double>> c$4 = mat_b;

 vec<vec<double>> mat_b = c$4;
 /**Let**/vec<vec<double>> c$64;
 {

 $MRK(c$7);
-double c$6 = ks_get<3>(var0);
+double c$6 = beta;
 $REL(c$7);

 double beta = c$6;
 /**Let**/vec<vec<double>> c$63;
 {

-vec<vec<double>> c$8 = ks_get<4>(var0);
+vec<vec<double>> c$8 = mat_c;

 vec<vec<double>> mat_c = c$8;
 /**Let**/vec<vec<double>> c$62;
@@ -309,7 +309,7 @@
 }

-ty$gemm c$24 = gemm(std::make_tuple(1.0,c$6,c$14,1.0,c$22));
+ty$gemm c$24 = gemm(1.0,c$6,c$14,1.0,c$22);

 $MRK(c$27);
 int c$26 = size(c$24);

i.e. just replacing the argument list, and gets from var0 with the appropriate arg. Repeat, that did not affect the (fast) performance....

acl33 commented 5 years ago

However, I realize that blas_test2.cpp contains code like this (comments added):

{

$MRK(c$1);
double c$0 = alpha; // Reading argument passed in
$REL(c$1);

double alpha = c$0; // Declare new variable shadowing argument

so I removed the duplicate variables, i.e. the following diff:

@@ -21,37 +21,18 @@
 /**Let**/vec<vec<double>> c$67;
 {

-$MRK(c$1);
-double c$0 = alpha;
-$REL(c$1);
-
-double alpha = c$0;
 /**Let**/vec<vec<double>> c$66;
 {

-vec<vec<double>> c$2 = mat_a;
-
-vec<vec<double>> mat_a = c$2;
 /**Let**/vec<vec<double>> c$65;
 {

-vec<vec<double>> c$4 = mat_b;
-
-vec<vec<double>> mat_b = c$4;
 /**Let**/vec<vec<double>> c$64;
 {

-$MRK(c$7);
-double c$6 = beta;
-$REL(c$7);
-
-double beta = c$6;
 /**Let**/vec<vec<double>> c$63;
 {

-vec<vec<double>> c$8 = mat_c;
-
-vec<vec<double>> mat_c = c$8;
 /**Let**/vec<vec<double>> c$62;
 {
 /*build*/

and, lo, performance regressed back to the level of blas_slow again!!

awf commented 5 years ago

Hang on - to be clear: the just-above diff is responsible for the slowdown (or almost all of it)?

acl33 commented 5 years ago

Yes, the slowdown was indeed incurred by the last diff, as per:

so I removed the duplicate variables, i.e. the following diff: ..... and, lo, performance regressed back to the level of blas_slow again!!

Yes, this seems very odd. Need to dig into what that's doing to the gcc IR but haven't done so yet. (Possibly we should also try LLVM!)

microsoft / knossos-ksc

Better C++ codegen when passing multiple arguments as tuple #98