Xilinx / Vitis_Libraries

Vitis Libraries
https://docs.xilinx.com/r/en-US/Vitis_Libraries
Apache License 2.0
899 stars 357 forks source link

An error occurred while testing the data_analytics library. #65

Closed BiscuitsColonel closed 3 years ago

BiscuitsColonel commented 3 years ago

Hi,

I'm currently testing the data_analytics library.

I tested it using the attached program. In most cases the test will work, but if I test with the following steps, it will not work.

# cat test.log 
pineapple
green apple
apple

# ./test.exe -xclbin reEngineKernel.xclbin -in test.log -out result.log -pt app
----------------------log analytics with regex----------------
INFO: selected device xilinx_u280_xdma_201920_3
INFO: initilized context.
INFO: initilized command queue.
INFO: created program with binary reEngineKernel.xclbin
INFO: built program.
The log file is partition into 1 section with max_slice_lnm 5 and  takes 0.002000 ms.
DEBUG: reEngineKernel has 8 CU(s)
regex pipelined, time: 1.298 ms, size: 3.05176e-05 MB, throughput: 2.29602e-05 GB/s
-----------------------------Finished regex pipelined test----------------------------------------------

ERROR: undefined error code
ERROR: result mismatch

# cat result.log 
Mismatch
0: [-1, -1]
Mismatch
0: [-1, -1]
Match
0: [0, 3]

I would appreciate it if you could check the above results and tell me how to solve it.

Thanks.

BiscuitsColonel commented 3 years ago
/*
 * Copyright 2020 Xilinx, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <cstdlib>
#include <fstream>

#include "xf_data_analytics/text/regex_engine.hpp"
#include "general_config.hpp"

// for validating result.
extern "C" {
#include "oniguruma.h"
}

enum {
    MAX_MSG_DEPTH = 250000000,   // Max number of messages in a section
    MAX_MSG_LEN = 65536,         // Max length of message in byte
    MAX_LNM = 6000000,           // Max number of lines in a single section
    MAX_OUT_DEPTH = MAX_LNM * 20 // 20 for 19 capturing groups at most
};
int check_result(std::string pattern,
                 uint64_t* msg_buff,
                 uint32_t* offt_buff,
                 uint16_t* len_buff,
                 uint32_t* out_buff,
                 uint32_t lnm,
                 uint32_t cpgp_nm) {
    int r;
    unsigned char *start, *range, *end;
    regex_t* reg;
    OnigErrorInfo einfo;
    OnigRegion* region = onig_region_new();
    OnigEncoding use_encs[1];

    use_encs[0] = ONIG_ENCODING_ASCII;
    onig_initialize(use_encs, sizeof(use_encs) / sizeof(use_encs[0]));

    UChar* pattern_c = (UChar*)pattern.c_str();

    r = onig_new(&reg, pattern_c, pattern_c + strlen((char*)pattern_c), ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII,
                 ONIG_SYNTAX_DEFAULT, &einfo);
    if (r != ONIG_NORMAL) {
        char s[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str((UChar*)s, r, &einfo);
        fprintf(stderr, "ERROR: %s\n", s);
        return -1;
    }
    unsigned char* max_str = (unsigned char*)malloc(MAX_MSG_LEN);
    for (int i = 0; i < lnm; ++i) {
        // generate referecne
        int offt = offt_buff[i];
        memcpy(max_str, &msg_buff[offt], len_buff[i]);
        max_str[len_buff[i]] = '\0';
        UChar* str = (UChar*)max_str;
        end = str + strlen((char*)str);
        start = str;
        range = end;
        r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE);
        // compare with actual result
        // match
        if (r == 0) {
            if (out_buff[i * (cpgp_nm + 1)] == 1) {
                for (int j = 0; j < cpgp_nm; ++j) {
                    if (region->beg[j] != out_buff[i * (cpgp_nm + 1) + j + 1] % 65536 ||
                        region->end[j] != out_buff[i * (cpgp_nm + 1) + j + 1] / 65536) {
                        fprintf(stderr, "ERROR: msg: %d, capture group: %d, ref:[%d, %d], act:[%d, %d]\n", i, j,
                                region->beg[j], region->end[j], out_buff[i * (cpgp_nm + 1) + j + 1] % 65536,
                                out_buff[i * (cpgp_nm + 1) + j + 1] / 65536);
                        return -1;
                    }
                }
            } else {
                fprintf(stderr, "ERROR: msg: %d, ref: %d, act: %d\n", i, 1, out_buff[i * (cpgp_nm + 1)]);
                return -1;
            }
            // mismatch
        } else if (r == ONIG_MISMATCH) {
            if (out_buff[i * (cpgp_nm + 1)] != 0) {
                fprintf(stderr, "ERROR: msg: %d, ref: %d, act: %d\n", i, 0, out_buff[i * (cpgp_nm + 1)]);
                return -1;
            }
        } else {
            char s[ONIG_MAX_ERROR_MESSAGE_LEN];
            onig_error_code_to_str((UChar*)s, r);
            fprintf(stderr, "ERROR: %s\n", s);
            onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
            onig_free(reg);
            onig_end();
            return -1;
        }
    }
    onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
    onig_free(reg);
    onig_end();
    return 0;
}
void store_dat(std::ofstream& out_file, uint32_t* out_buff, uint32_t lnm, uint32_t cpgp_nm) {
    typedef union {
        int16_t int_a[2];
        uint32_t d;
    } uint32_un;
    for (unsigned int i = 0; i < lnm; ++i) {
        for (unsigned int j = 0; j < cpgp_nm + 1; ++j) {
            if (i * (cpgp_nm + 1) + j < MAX_OUT_DEPTH) {
                uint32_t out = out_buff[i * (cpgp_nm + 1) + j];
                // match result
                if (j == 0) {
                    if (out == 0)
                        out_file << "Mismatch\n";
                    else
                        out_file << "Match\n";
                } else {
                    // offset of capture group
                    uint32_un tmp;
                    tmp.d = out;
                    out_file << j - 1 << ": [" << tmp.int_a[0] << ", " << tmp.int_a[1] << "]\n";
                }
            }
        }
    }
}
int load_dat(
    std::ifstream& log_file, uint64_t* msg_buff, uint32_t* offt_buff, uint16_t* len_buff, uint32_t& lnm, int limit_ln) {
    typedef union {
        char c_a[8];
        uint64_t d;
    } uint64_un;

    lnm = 0;
    std::string line;
    uint32_t offt = 0;
    while (!log_file.eof() && (offt < (MAX_MSG_DEPTH - MAX_MSG_LEN / 8)) && lnm < MAX_LNM &&
           (limit_ln == -1 || lnm < limit_ln)) {
        getline(log_file, line);
        size_t sz = line.size();
        // max line
        if (sz >= MAX_MSG_LEN) {
            std::cerr << "ERROR: length of line exceeds " << MAX_MSG_LEN << ".\n";
            return -1;
            // ignore empty line
        } else if (sz > 0) {
            offt_buff[lnm] = offt;
            len_buff[lnm] = sz;
            for (int i = 0; i < (sz + 7) / 8; ++i) {
                uint64_un out;
                for (unsigned int j = 0; j < 8; ++j) {
                    if (i * 8 + j < sz) {
                        out.c_a[j] = line[i * 8 + j];
                    } else {
                        out.c_a[j] = ' ';
                    }
                }
                msg_buff[offt++] = out.d;
            }
            lnm++;
        }
    }
    // one more
    offt_buff[lnm] = offt;
    return 0;
}
int main(int argc, const char* argv[]) {
    std::cout << "----------------------log analytics with regex----------------" << std::endl;
    // command argument parser
    // TODO use new argument parser from Utility library.
    xf::data_analytics::text::details::ArgParser parser(argc, argv);

    std::string xclbin_path;
    if (!parser.getCmdOption("-xclbin", xclbin_path)) {
        std::cout << "ERROR: xclbin path is not set!\n";
        return 1;
    }
    std::string log_path;
    if (!parser.getCmdOption("-in", log_path)) {
        std::cout << "ERROR:  input log path is not specified.\n";
        return 1;
    }
    std::string out_path;
    if (!parser.getCmdOption("-out", out_path)) {
        std::cout << "ERROR:  output path is not specified.\n";
        return 1;
    }
    std::string ln_nm;
    int limit_ln = -1;
    if (parser.getCmdOption("-lnm", ln_nm)) {
        try {
            limit_ln = std::stoi(ln_nm);
        } catch (...) {
            limit_ln = -1;
        }
    }
    std::string pattern;
    if (!parser.getCmdOption("-pt", pattern)) {
        std::cout << "ERROR:  pattern is not specified.\n";
    return 1;
    }

    // allocate the in-memory buffer
    xf::data_analytics::text::details::MM mm;
    uint64_t* msg_buff = mm.aligned_alloc<uint64_t>(MAX_MSG_DEPTH);
    uint32_t* offt_buff = mm.aligned_alloc<uint32_t>(MAX_LNM);
    uint16_t* len_buff = mm.aligned_alloc<uint16_t>(MAX_LNM);
    uint32_t* out_buff = mm.aligned_alloc<uint32_t>(MAX_OUT_DEPTH);
    // constructor of reEngine
    xf::data_analytics::text::re::RegexEngine reInst(xclbin_path, 0,                          // device config
                                                     INSTR_DEPTH, CCLASS_NM, CPGP_NM, MSG_SZ, // re limits
                                                     SLICE_SZ, SLICE_NM);                     // prcessing
    xf::data_analytics::text::re::ErrCode err_code;
    // compile pattern
    err_code = reInst.compile(pattern);
    if (err_code != 0) return -1;

    // get capture group number
    uint16_t cpgp_nm = reInst.getCpgpNm();

    // load data from disk to in-memory buffer
    std::ifstream log_file(log_path);
    std::ofstream out_file(out_path);
    uint32_t lnm = 0;
    if (!log_file.is_open()) {
        std::cerr << "ERROR: " << log_path << " cannot be opened for read.\n";
        return -1;
    }
    if (!out_file.is_open()) {
        std::cerr << "ERROR: " << out_path << " cannot be opened for write.\n";
        return -1;
    }
    while (!log_file.eof() && (limit_ln == -1 || lnm < limit_ln)) {
        // load data
        if (load_dat(log_file, msg_buff, offt_buff, len_buff, lnm, limit_ln) == -1) {
            return -1;
        }
        if (lnm > 0) {
            // call reInst to do regex
            err_code = reInst.match(lnm, msg_buff, offt_buff, len_buff, out_buff);
            if (err_code) {
                std::cerr << "ERROR: match failed.\n";
                return -1;
            }
            // write data to disk
            store_dat(out_file, out_buff, lnm, cpgp_nm);
            // if check is open, check the result with golden
            int r = check_result(pattern, msg_buff, offt_buff, len_buff, out_buff, lnm, cpgp_nm);
            if (r == -1) {
                fprintf(stderr, "ERROR: result mismatch\n");
            } else {
                fprintf(stdout, "SUCCESS: result match\n");
            }
        }
    }
    log_file.close();
    out_file.close();
    std::quick_exit(0);
    return 0;
}
vt-lib-support commented 3 years ago

Hi,

Thank you for bringing this to our attention. We are implementing the regex match behavior similar to Pytyon's re.match, which "return a corresponding match object if zero or more characters at the beginning of string match this regular expression".

So pattern app will match apple but will not match pineapple.

We will improve documentation and fix our test case to make this more clear in next release.

Thanks!

BiscuitsColonel commented 3 years ago

I understand the function. Thanks.