tensor-compiler / taco

The Tensor Algebra Compiler (taco) computes sparse tensor expressions on CPUs and GPUs
http://tensor-compiler.org

Direct concrete index notation schedules break tensor serialization order #495

Open willow-ahrens opened 2 years ago

willow-ahrens commented 2 years ago

When specifying a schedule via direct concrete index notation, TACO assumes that the postorder traversal of tensors in the index expression will match the postorder traversal of tensors in the concrete index notation. This may hold when the concrete index notation is generated by TACO's scheduling commands, but it certainly does not hold when the concrete index notation is specified explicitly. The offending line in TACO is roughly https://github.com/tensor-compiler/taco/blob/0ede00290e97c8409dfe0156c77790b6924decbb/src/tensor.cpp#L812, the line in compute() that chooses the tensor ordering to pass to the compiled code; the compiled code expects the tensors in a different order.

A robust solution would be to sort the tensors by their name in both the compute() function and the compiled code.
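A minimal sketch of that idea (illustrative only, not TACO's actual argument-collection code; canonicalOrder is a hypothetical helper, and the only TACO API it assumes is TensorVar::getName() from taco/index_notation/index_notation.h):

#include <algorithm>
#include <string>
#include <vector>
#include "taco/index_notation/index_notation.h"

// Hypothetical helper: put tensor arguments into a canonical order by name.
static std::vector<taco::TensorVar> canonicalOrder(std::vector<taco::TensorVar> args) {
    std::sort(args.begin(), args.end(),
              [](const taco::TensorVar& a, const taco::TensorVar& b) {
                  return a.getName() < b.getName();
              });
    return args;
}

If compute() and the code generator both ordered the tensor arguments this way, the serialization order would no longer depend on how the concrete index notation was constructed.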

willow-ahrens commented 2 years ago

Here is an example to reproduce the problem.

File spmv2.cpp:

#include "taco/tensor.h"
#include <iostream>
#include <getopt.h>
#include <sys/stat.h>
#include <string>
#include <chrono>

template <typename Setup, typename Test>
double benchmark(double time_max, int trial_max, Setup setup, Test test){
    auto time_total = std::chrono::high_resolution_clock::duration(0);
    auto time_min = std::chrono::high_resolution_clock::duration(0);
    int trial = 0;
    while(trial < trial_max){
        setup();
        auto tic = std::chrono::high_resolution_clock::now();
        test();
        auto toc = std::chrono::high_resolution_clock::now();
        auto time = std::chrono::duration_cast<std::chrono::nanoseconds>(toc-tic);
        trial++;
        if(trial == 1 || time < time_min){
            time_min = time;
        }
        time_total += time;
        if(time_total.count() * 1e-9 > time_max){
            break;
        }
    }
    return time_min.count() * 1e-9;
}

using namespace taco;

static void usage()
{
    fprintf(stderr,
        "usage: foo [options]\n"
        "  --ntrials <arg>        Maximum number of trials to run\n"
        "  --ttrials <arg>        Maximum time to run trials\n"
        "  --help                 Display help message\n"

"--tensor_B <arg> file\n"

"--tensor_d <arg> file\n"

"--tensor_C <arg> file\n"

"--tensor_a <arg> file\n"

    );
}

int main(int argc, char **argv)
{
    int help = 0;

    int n_trials = 10000;
    double t_trials = 5.0;

    std::string file_B = "";
    std::string file_d = "";
    std::string file_C = "";
    std::string file_a = "";

    /* Beware. Option parsing below. */
    long longarg;
    double doublearg;
    struct stat statthing;
    while (1)
    {
        const char *options = "";
        const struct option long_options[] = {
            {"ntrials", required_argument, NULL, 1},
            {"ttrials", required_argument, NULL, 1},
            {"help", no_argument, &help, 1},

{"tensor_B", required_argument, NULL, 1},

{"tensor_d", required_argument, NULL, 1},

{"tensor_C", required_argument, NULL, 1},

{"tensor_a", required_argument, NULL, 1},

            {0, 0, 0, 0}};

        /* getopt_long stores the option index here. */
        int option_index = 0;

        int c = getopt_long(argc, argv, options, long_options, &option_index);

        if (c == 0){
            continue;
        }

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (option_index) {
            case 0:
                errno = 0;
                longarg = strtol(optarg, 0, 10);
                if (errno != 0 || longarg < 1)
                {
                    printf("option --ntrials takes an integer maximum number of trials >= 1\n");
                    usage();
                    return 1;
                }
                n_trials = longarg;
                break;

            case 1:
                errno = 0;
                doublearg = strtod(optarg, 0);
                if (errno != 0 || doublearg < 0.0)
                {
                    printf("option --ttrials takes a maximum measurement time in seconds >= 0.0\n");
                    usage();
                    return 1;
                }
                t_trials = doublearg;
                break;

            case 2:
                help = 1;
                break;

            case 3:
                if (stat(optarg, &statthing) < 0 || !S_ISREG(statthing.st_mode))
                {
                    printf("argument to --tensor_B must be a file\n");
                    usage();
                    return 1;
                }
                file_B = optarg;
                break;

            case 4:
                if (stat(optarg, &statthing) < 0 || !S_ISREG(statthing.st_mode))
                {
                    printf("argument to --tensor_d must be a file\n");
                    usage();
                    return 1;
                }
                file_d = optarg;
                break;

            case 5:
                if (stat(optarg, &statthing) < 0 || !S_ISREG(statthing.st_mode))
                {
                    printf("argument to --tensor_C must be a file\n");
                    usage();
                    return 1;
                }
                file_C = optarg;
                break;

            case 6:
                if (stat(optarg, &statthing) < 0 || !S_ISREG(statthing.st_mode))
                {
                    printf("argument to --tensor_a must be a file\n");
                    usage();
                    return 1;
                }
                file_a = optarg;
                break;

            default:
                printf("unrecognized argument\n");
                usage();
                abort();
        }
    }

    if (help)
    {
        printf("Try a tensor kernel!\n");
        usage();
        return 0;
    }

    taco::setEvaluateAtAssign(false);

    // Create tensors

if(file_B == ""){
    std::cout << "oh no! There's no tensor file for B" << std::endl;
    return -1;
}
Tensor<double> tensor_B = read(file_B, Format({Dense, Sparse}), true);
TensorVar tensorvar_B = tensor_B.getTensorVar();

if(file_d == ""){
    std::cout << "oh no! There's no tensor file for d" << std::endl;
    return -1;
}
Tensor<double> tensor_d = read(file_d, Format({Dense}), true);
TensorVar tensorvar_d = tensor_d.getTensorVar();

if(file_C == ""){
    std::cout << "oh no! There's no tensor file for C" << std::endl;
    return -1;
}
Tensor<double> tensor_C = read(file_C, Format({Dense, Sparse}), true);
TensorVar tensorvar_C = tensor_C.getTensorVar();

Tensor<double> tensor_B_6432 = tensor_B.transpose({0, 1}, Format({Sparse, Sparse}));
TensorVar tensorvar_B_6432 = tensor_B_6432.getTensorVar();

Tensor<double> tensor_w_1({}, Format({}));
TensorVar tensorvar_w_1 = tensor_w_1.getTensorVar();

Tensor<double> tensor_a({tensor_B.getDimensions()[0]}, Format({Dense}));
TensorVar tensorvar_a = tensor_a.getTensorVar();

    // Form a tensor-vector multiplication expression

    IndexVar index_i_6434;
    IndexVar index_j_6433;
    IndexVar index_k;
    IndexVar index_j;
    IndexVar index_i;
    IndexVar index_foo_6435;

    tensor_a(index_foo_6435) += ((tensor_d(index_k) * tensor_C(index_j, index_k)) * tensor_B_6432(index_j, index_i));

    // Compile the expression
    tensor_a.compile(
        forall(index_j,
               where(forall(index_i,
                            tensorvar_a(index_i) += (tensorvar_w_1() * tensorvar_B_6432(index_j, index_i))),
                     forall(index_k,
                            tensorvar_w_1() += (tensorvar_d(index_k) * tensorvar_C(index_j, index_k))))));

    // Assemble output indices and numerically compute the result
    auto time = benchmark(
        t_trials, n_trials,
        [&tensor_a]() {
            tensor_a.assemble();
            //tensor_a.setNeedsCompute(true);
        },
        [&tensor_a]() {
            tensor_a.compute();
        });

    std::cout << time << std::endl;

if(file_a != ""){
    write(file_a, tensor_a);
}

    return 0;
}

In a file spmv2_breaks.cpp, change the above line

tensor_a(index_foo_6435) += ((tensor_d(index_k) * tensor_C(index_j, index_k)) * tensor_B_6432(index_j, index_i));

to

tensor_a(index_foo_6435) += (tensor_C(index_j, index_k) * (tensor_d(index_k) * tensor_B_6432(index_j, index_i)));
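The only difference is the order in which the tensor operands appear in the expression, and therefore their postorder traversal. A tiny standalone illustration of the mismatch (operand names only, not TACO internals; which order the compiled kernel actually expects is the assumption described at the top of this issue):

#include <cassert>
#include <string>
#include <vector>

int main() {
    // Operand postorder of the spmv2.cpp expression (d(k) * C(j,k)) * B_6432(j,i):
    std::vector<std::string> order_spmv2 = {"d", "C", "B_6432"};
    // Operand postorder of the spmv2_breaks.cpp expression C(j,k) * (d(k) * B_6432(j,i)):
    std::vector<std::string> order_spmv2_breaks = {"C", "d", "B_6432"};
    // Both programs compile the same concrete index notation, so the generated
    // kernel expects one fixed operand order; the two expressions cannot both match it.
    assert(order_spmv2 != order_spmv2_breaks);
    return 0;
}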

File tensor_B.tns:

1 1 1.0
1024 1024 2.0

File tensor_C.tns:

1 1 1.0
1024 1024 2.0

File tensor_d.tns:

1 1.0
1024 2.0

Compile with

gcc -c -o spmv2_breaks.o spmv2_breaks.cpp --std=c++11 -I/data/scratch/pahrens/taco/include -I/data/scratch/pahrens/taco/src -g -ggdb -O0
gcc -o spmv2_breaks spmv2_breaks.o --std=c++11 -L/data/scratch/pahrens/taco/build/lib -g -ggdb -O0 -lm -ltaco -lstdc++

and run with

 LD_LIBRARY_PATH=/data/scratch/pahrens/taco/build/lib ./spmv2_breaks --tensor_B=tensor_B.tns --tensor_C=tensor_C.tns --tensor_d=tensor_d.tns

to obtain a very exciting segfault. Compiling and running spmv2 works fine, as expected: its expression lists the tensors in the one ordering that the compiled code expects. Note that this is not the only ordering that avoids the segfault; some orderings run to completion but produce silently incorrect results.