oracle / dtrace-utils

DTrace-utils contains the DTrace port to Linux
Other
132 stars 19 forks source link

Build failure with corrupted `test-triggers--usdt-tst-special.o` #98

Open thesamesam opened 2 weeks ago

thesamesam commented 2 weeks ago

(Please don't spend time on this yet as I don't really feel like I've got a clear idea of how to reproduce it or why this environment is triggering it.)

I hit this build failure via the ebuild, which failed as follows (with a debug print added):

/var/tmp/portage/dev-debug/dtrace-9999/work/dtrace-9999/build/run-dtrace -x nolibs -G -o /var/tmp/portage/dev-debug/dtrace-9999/work/dtrace-9999/build/test-triggers--usdt-tst-special-prov.o -s test/triggers/usdt-tst-special-prov.d /var/tmp/portage/dev-debug/dtrace-9999/work/dtrace-9999/build/test-triggers--usdt-tst-special.o
process_obj: elf_update write failed
dtrace: failed to link script test/triggers/usdt-tst-special-prov.d: an error was encountered while processing /var/tmp/portage/dev-debug/dtrace-9999/work/dtrace-9999/build/test-triggers--usdt-tst-special.o
make: *** [Makerules:31: /var/tmp/portage/dev-debug/dtrace-9999/work/dtrace-9999/build/test-triggers--usdt-tst-special-prov.o] Error 1
 * ERROR: dev-debug/dtrace-9999::gentoo failed (compile phase):
 *   emake failed

Broken

~/git/dtrace-utils-bad $ make CFLAGS="-O2 -march=native -g" verbose=yes
[...]
cc -Iinclude -Iuts/common -Iinclude/i386 -I/home/sam/git/dtrace-utils-bad/build  -O2 -march=native -g -std=gnu99 -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_DT_VERSION=\"2.0.1\" -fno-inline -O2 -MP -MMD -MF /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o.deps -MT /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o -c -o /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o test/triggers/usdt-tst-special.c
/home/sam/git/dtrace-utils-bad/build/run-dtrace -x nolibs -G -o /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special-prov.o -s test/triggers/usdt-tst-special-prov.d /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o
dtrace: failed to link script test/triggers/usdt-tst-special-prov.d: an error was encountered while processing /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o
make: *** [Makerules:31: /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special-prov.o] Error 1

or

# Trying again
$ make CFLAGS="-O2 -march=native -g" verbose=yes
ln -sf /home/sam/git/dtrace-utils-bad/build/drti.o /home/sam/git/dtrace-utils-bad/build/dlibs/drti/drti.o
if [[ -f .git/index ]]; then \
        git log --no-walk --pretty=format:%H > .git-version.tmp; \
else \
        cp .git-archive-version .git-version.tmp; \
fi
if test -r ".git-version" && cmp -s ".git-version" ".git-version.tmp"; then \
        rm -f ".git-version.tmp"; \
else \
        printf "VERSION: .git-version\n"; \
        mv -f ".git-version.tmp" ".git-version"; \
fi
bpf-unknown-none-gcc -D__amd64 -Ilibdtrace -Iinclude -I/home/sam/git/dtrace-utils-bad/build/include -idirafter /usr/include -O2 -Wall -Wno-unknown-pragmas -mcpu=v3 -masm=normal -S \
        -o - bpf/get_bvar.c | \
        awk '/dt_get_bvar:/ { \
                 getline; \
                 if(/stxdw/ && /%r6$/) { \
                     print "ERROR: bpf-unknown-none-gcc is too old"; \
                     exit(1); \
                 } \
                 exit(0); \
             }'
/home/sam/git/dtrace-utils-bad/build/run-dtrace -x nolibs -G -o /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special-prov.o -s test/triggers/usdt-tst-special-prov.d /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o
cc -O2 -march=native -g -std=gnu99 -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_DT_VERSION=\"2.0.1\" -fno-inline -O2  -o /home/sam/git/dtrace-utils-bad/build/usdt-tst-special /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special-prov.o -L/home/sam/git/dtrace-utils-bad/build
/usr/lib/gcc/x86_64-pc-linux-gnu/15/../../../../x86_64-pc-linux-gnu/bin/ld: warning: /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o has a section extending past end of file
/usr/lib/gcc/x86_64-pc-linux-gnu/15/../../../../x86_64-pc-linux-gnu/bin/ld: error: /home/sam/git/dtrace-utils-bad/build/test-triggers--usdt-tst-special.o: ELF section name out of range
collect2: error: ld returned 1 exit status
make: *** [Makerules:31: /home/sam/git/dtrace-utils-bad/build/usdt-tst-special] Error 1

Hacky debugging patch

$ git diff
diff --git a/libdtrace/dt_link.c b/libdtrace/dt_link.c
index b2148a8b..17cb8df2 100644
--- a/libdtrace/dt_link.c
+++ b/libdtrace/dt_link.c
@@ -1111,8 +1111,10 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)

    scn_rel = NULL;
    while ((scn_rel = elf_nextscn(elf, scn_rel)) != NULL) {
-       if (gelf_getshdr(scn_rel, &shdr_rel) == NULL)
+       if (gelf_getshdr(scn_rel, &shdr_rel) == NULL) {
+           __builtin_printf("process_obj: gelf_getshdr(scn_rel, &shdr_rel) failed\n");
            goto err;
+                }

        /*
         * Skip any non-relocation sections.
@@ -1120,8 +1122,10 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
        if (shdr_rel.sh_type != SHT_RELA && shdr_rel.sh_type != SHT_REL)
            continue;

-       if ((data_rel = elf_getdata(scn_rel, NULL)) == NULL)
+       if ((data_rel = elf_getdata(scn_rel, NULL)) == NULL) {
+           __builtin_printf("process_obj: elf_getdata(scn_rel, NULL) failed\n");
            goto err;
+       }

        /*
         * Grab the section, section header and section data for the
@@ -1129,16 +1133,20 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
         */
        if ((scn_sym = elf_getscn(elf, shdr_rel.sh_link)) == NULL ||
            gelf_getshdr(scn_sym, &shdr_sym) == NULL ||
-           (data_sym = elf_getdata(scn_sym, NULL)) == NULL)
+           (data_sym = elf_getdata(scn_sym, NULL)) == NULL) {
+           __builtin_printf("process_obj: section grab/hdr/data failed\n");
            goto err;
+       }

        /*
         * Ditto for that symbol table's string table.
         */
        if ((scn_str = elf_getscn(elf, shdr_sym.sh_link)) == NULL ||
            gelf_getshdr(scn_str, &shdr_str) == NULL ||
-           (data_str = elf_getdata(scn_str, NULL)) == NULL)
+           (data_str = elf_getdata(scn_str, NULL)) == NULL) {
+           __builtin_printf("process_obj: string table\n");
            goto err;
+       }

        /*
         * Grab the section, section header and section data for the
@@ -1148,8 +1156,10 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
         */
        if ((scn_tgt = elf_getscn(elf, shdr_rel.sh_info)) == NULL ||
            gelf_getshdr(scn_tgt, &shdr_tgt) == NULL ||
-           (data_tgt = elf_getdata(scn_tgt, NULL)) == NULL)
+           (data_tgt = elf_getdata(scn_tgt, NULL)) == NULL) {
+           __builtin_printf("process_obj: target reloc search failed\n");
            goto err;
+       }

        /*
         * We're looking for relocations to symbols matching this form:
@@ -1200,6 +1210,7 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
            if (gelf_getsym(data_sym, GELF_R_SYM(rela.r_info),
                &rsym) == NULL) {
                dt_strtab_destroy(strtab);
+               __builtin_printf("process_obj: failed after destroy\n");
                goto err;
            }

@@ -1211,6 +1222,7 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
            if (dt_elf_symtab_lookup(data_sym, isym, rela.r_offset,
                shdr_rel.sh_info, &fsym) != 0) {
                dt_strtab_destroy(strtab);
+               __builtin_printf("process_obj: failed after destroy 2\n");
                goto err;
            }

@@ -1219,6 +1231,7 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)

            if (fsym.st_name > data_str->d_size) {
                dt_strtab_destroy(strtab);
+               __builtin_printf("process_obj: failed after destroy 3\n");
                goto err;
            }

@@ -1238,6 +1251,7 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
                objkey, s) + 1;
            if ((p = dt_alloc(dtp, len)) == NULL) {
                dt_strtab_destroy(strtab);
+               __builtin_printf("process_obj: failed after destroy 4\n");
                goto err;
            }
            snprintf(p, len, dt_symfmt, dt_symprefix, objkey, s);
@@ -1270,12 +1284,15 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)

            dt_strtab_destroy(strtab);

-           if ((pair = dt_alloc(dtp, sizeof(*pair))) == NULL)
+           if ((pair = dt_alloc(dtp, sizeof(*pair))) == NULL) {
+               __builtin_printf("process_obj: failed dt_alloc pair\n");
                goto err;
+           }

            if ((pair->dlp_str = dt_alloc(dtp, data_str->d_size +
                len)) == NULL) {
                dt_free(dtp, pair);
+               __builtin_printf("process_obj: failed dt_alloc pair->dlp_str\n");
                goto err;
            }

@@ -1283,6 +1300,7 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
                nsym * symsize)) == NULL) {
                dt_free(dtp, pair->dlp_str);
                dt_free(dtp, pair);
+               __builtin_printf("process_obj: failed dt_alloc pair->dlp_sym\n");
                goto err;
            }

@@ -1333,8 +1351,10 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
            ndx = GELF_R_SYM(rela.r_info);

            if (gelf_getsym(data_sym, ndx, &rsym) == NULL ||
-               rsym.st_name > data_str->d_size)
+               rsym.st_name > data_str->d_size) {
+               __builtin_printf("process_obj: failed getsym/size check\n");
                goto err;
+           }

            s = (char *)data_str->d_buf + rsym.st_name;

@@ -1358,12 +1378,16 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
                dt_dprintf("normal probe\n");
            }

-           if (*s++ != '_')
+           if (*s++ != '_') {
+               __builtin_printf("process_obj: not _\n");
                goto err;
+           }

            if ((p = strstr(s, "___")) == NULL ||
-               p - s >= sizeof(pname))
+               p - s >= sizeof(pname)) {
+               __builtin_printf("process_obj: not __ or bad size\n");
                goto err;
+           }

            memcpy(pname, s, p - s);
            pname[p - s] = '\0';
@@ -1371,11 +1395,15 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
            p = strhyphenate(p + 3); /* strlen("___") */

            if (dt_elf_symtab_lookup(data_sym, isym, rela.r_offset,
-               shdr_rel.sh_info, &fsym) != 0)
+               shdr_rel.sh_info, &fsym) != 0) {
+               __builtin_printf("process_obj: lookup failed after strhyphenate\n");
                goto err;
+           }

-           if (fsym.st_name > data_str->d_size)
+           if (fsym.st_name > data_str->d_size) {
+               __builtin_printf("process_obj: bad size check on fsym\n");
                goto err;
+           }

            assert(GELF_ST_TYPE(fsym.st_info) == STT_FUNC);

@@ -1409,8 +1437,10 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
            } else if (strncmp(s, dt_symprefix,
                strlen(dt_symprefix)) == 0) {
                r = s;
-               if ((s = strchr(s, '.')) == NULL)
+               if ((s = strchr(s, '.')) == NULL) {
+                   __builtin_printf("process_obj: bad .\n");
                    goto err;
+               }
                s++;
            }

@@ -1425,8 +1455,10 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
            assert(fsym.st_value <= rela.r_offset);

            off = rela.r_offset - fsym.st_value;
-           if (dt_modtext(dtp, data_tgt->d_buf, &rela, &off) != 0)
+           if (dt_modtext(dtp, data_tgt->d_buf, &rela, &off) != 0) {
+               __builtin_printf("process_obj: dt_modtext bad\n");
                goto err;
+           }

            if (dt_probe_define(pvp, prp, s, r, off, eprobe) != 0)
                return dt_link_error(dtp, elf, fd, bufs,
@@ -1470,8 +1502,10 @@ process_obj(dtrace_hdl_t *dtp, const char *obj, int *eprobesp)
        }
    }

-   if (mod && elf_update(elf, ELF_C_WRITE) == -1)
+   if (mod && elf_update(elf, ELF_C_WRITE) == -1) {
+       __builtin_printf("process_obj: elf_update write failed\n");
        goto err;
+   }

    elf_end(elf);
    close(fd);

Working

~/git/dtrace-utils-good $ make CFLAGS="-O2 -march=native" verbose=yes
# fine

What I don't get yet is:

This is with gcc version 15.0.0 20240826 (experimental) 92c5265d22afaac146b2a7ecbc3dac9fc3382877 (Gentoo 15.0.9999 p, commit 24f7b8a07ce29ac39d8d3245a1ba7f7abf3dcfa1) but I reproduced it with GCC 14 and 13 too.

nickalcock commented 2 weeks ago

I've seen this too, mostly when doing parallel builds of things doing dtrace -G, but never managed to track it down. I wonder if the problem is that dtrace -G modifies its inputs, as well as its outputs, so if you run dtrace -G with the same set of inputs twice at the same time... oh dear. Except you just ruled that out. It must be something else...

thesamesam commented 2 weeks ago

Diffing the two trees with native+expanded, I at least managed to kill the native dependency and cvised the flags down a bit (I didn't go too far because of https://github.com/oracle/dtrace-utils/issues/100):

$ make CFLAGS="-O2 -g -mno-serialize -msha -mno-shstk -mno-tbm -mno-tsxldtrk -mvaes -mwbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-amx-tile -mno-amx-int8 -mno-amx-bf16 -mno-uintr -mno-hreset -mno-kl -mno-widekl -mno-avxvnni -mno-avx512fp16 -mno-avxifma -mno-avxvnniint8 -mno-avxneconvert -mno-cmpccxadd -mno-amx-fp16 -mno-prefetchi -mno-raoint -mno-amx-complex -mno-avxvnniint16 -mno-sm3 -mno-sha512 -mno-sm4 -mno-apxf -mno-usermsr -mno-avx10.2-256 -mno-avx10.2-512" verbose=yes -j1

Not very insightful but it's better than "native fails". Pretty sure the flags can be reduced more and that it ended up getting stuck because of bad combinations. More later...