microsoft / CodeXGLUE

CodeXGLUE
MIT License
1.51k stars 363 forks source link

CodeXGLUE version of BigCloneBench contains pairs that do not seem like "clone" #99

Open SCM4L opened 2 years ago

SCM4L commented 2 years ago

Dear authors,

I am interested in semantic code clones in general, and I recently found this CodeXGLUE version of BigCloneBench. I manually checked the pairs included in the train/valid/test.txt, trying to reason about some characteristics of real-world code clone, but I realize there are many pairs that are not looking like clone. It seems that such pairs are labeled as "1". I am listing a few examples below, and these examples seem common in the dataset.

Could you please explain why, in this dataset, these pairs are regarded as "clone" and how does CodeXGLUE process BigCloneBench to generate such pairs? I also checked the CodeXGLUE paper regarding the BigCloneBench, but I could not find the answer about how clone pairs are built and how they are labeled. Given that many deep learning (DL) tools (e.g., CodeBERT, GraphCodeBERT, CodeT5) are using this dataset for evaluation, it will be great if we can understand more about which aspects DL models really learns regarding these pairs that do not look like "clone".

I would greatly appreciate your answer if you can enlighten me a bit!

# test.txt --> "984683\t411595\t1"
9846843
    public byte[] getResponse() {
        final ByteArrayInputStream bais = new ByteArrayInputStream(request);
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        List<String> lines = Collections.emptyList();
        try {
            @SuppressWarnings("unchecked") List<String> dl = IOUtils.readLines(bais);
            lines = dl;
        } catch (IOException ioex) {
            throw new AssertionError(ioex);
        }
        String resource = null;
        for (String line : lines) {
            if (line.startsWith("GET ")) {
                int endIndex = line.lastIndexOf(' ');
                resource = line.substring(4, endIndex);
            }
        }
        final PrintStream printStream = new PrintStream(baos);
        if (resource == null) {
            printStream.println("HTTP/1.1 400 Bad Request");
        } else {
            final InputStream inputStream = getClass().getResourceAsStream(resource);
            if (inputStream == null) {
                printStream.println("HTTP/1.1 404 Not Found");
                printStream.println();
            } else {
                printStream.println("HTTP/1.1 200 OK");
                printStream.println();
                try {
                    IOUtils.copy(inputStream, printStream);
                } catch (IOException ioex) {
                    throw new AssertionError(ioex);
                }
            }
        }
        printStream.flush();
        printStream.close();
        return baos.toByteArray();
    }
-------------------------------
411595
  private void displayDiffResults() throws IOException {
          File outFile = File.createTempFile("diff", ".htm");
          outFile.deleteOnExit();
          FileOutputStream outStream = new FileOutputStream(outFile);
          BufferedWriter out = new BufferedWriter(new OutputStreamWriter(outStream));
          out.write("<html><head><title>LOC Differences</title>\n" + SCRIPT + "</head>\n" + "<body bgcolor='#ffffff'>\n" + "<div onMouseOver=\"window.defaultStatus='Metrics'\">\n");
          if (addedTable.length() > 0) {
              out.write("<table border><tr><th>Files Added:</th>" + "<th>Add</th><th>Type</th></tr>");
              out.write(addedTable.toString());
              out.write("</table><br><br>");
          }
          if (modifiedTable.length() > 0) {
              out.write("<table border><tr><th>Files Modified:</th>" + "<th>Base</th><th>Del</th><th>Mod</th><th>Add</th>" + "<th>Total</th><th>Type</th></tr>");
              out.write(modifiedTable.toString());
              out.write("</table><br><br>");
          }
          if (deletedTable.length() > 0) {
              out.write("<table border><tr><th>Files Deleted:</th>" + "<th>Del</th><th>Type</th></tr>");
              out.write(deletedTable.toString());
              out.write("</table><br><br>");
          }
          out.write("<table name=METRICS BORDER>\n");
          if (modifiedTable.length() > 0 || deletedTable.length() > 0) {
              out.write("<tr><td>Base:&nbsp;</td><td>");
              out.write(Long.toString(base));
              out.write("</td></tr>\n<tr><td>Deleted:&nbsp;</td><td>");
              out.write(Long.toString(deleted));
              out.write("</td></tr>\n<tr><td>Modified:&nbsp;</td><td>");
              out.write(Long.toString(modified));
              out.write("</td></tr>\n<tr><td>Added:&nbsp;</td><td>");
              out.write(Long.toString(added));
              out.write("</td></tr>\n<tr><td>New & Changed:&nbsp;</td><td>");
              out.write(Long.toString(added + modified));
              out.write("</td></tr>\n");
          }
          out.write("<tr><td>Total:&nbsp;</td><td>");
          out.write(Long.toString(total));
          out.write("</td></tr>\n</table></div>");
          redlinesOut.close();
          out.flush();
          InputStream redlines = new FileInputStream(redlinesTempFile);
          byte[] buffer = new byte[4096];
          int bytesRead;
          while ((bytesRead = redlines.read(buffer)) != -1) outStream.write(buffer, 0, bytesRead);
          outStream.write("</BODY></HTML>".getBytes());
          outStream.close();
          Browser.launch(outFile.toURL().toString());
      }

Also

# test.txt --> "335223\t3430784\t1"
335223
    private static void readAndRewrite(File inFile, File outFile) throws IOException {
        ImageInputStream iis = ImageIO.createImageInputStream(new BufferedInputStream(new FileInputStream(inFile)));
        DcmParser dcmParser = DcmParserFactory.getInstance().newDcmParser(iis);
        Dataset ds = DcmObjectFactory.getInstance().newDataset();
        dcmParser.setDcmHandler(ds.getDcmHandler());
        dcmParser.parseDcmFile(null, Tags.PixelData);
        PixelDataReader pdReader = pdFact.newReader(ds, iis, dcmParser.getDcmDecodeParam().byteOrder, dcmParser.getReadVR());
        System.out.println("reading " + inFile + "...");
        pdReader.readPixelData(false);
        ImageOutputStream out = ImageIO.createImageOutputStream(new BufferedOutputStream(new FileOutputStream(outFile)));
        DcmEncodeParam dcmEncParam = DcmEncodeParam.IVR_LE;
        ds.writeDataset(out, dcmEncParam);
        ds.writeHeader(out, dcmEncParam, Tags.PixelData, dcmParser.getReadVR(), dcmParser.getReadLength());
        System.out.println("writing " + outFile + "...");
        PixelDataWriter pdWriter = pdFact.newWriter(pdReader.getPixelDataArray(), false, ds, out, dcmParser.getDcmDecodeParam().byteOrder, dcmParser.getReadVR());
        pdWriter.writePixelData();
        out.flush();
        out.close();
        System.out.println("done!");
    }

------------------------------------------
3430784
    public static Body decodeBody(InputStream in, String contentTransferEncoding) throws IOException {
        if (contentTransferEncoding != null) {
            contentTransferEncoding = MimeUtility.getHeaderParameter(contentTransferEncoding, null);
            if ("quoted-printable".equalsIgnoreCase(contentTransferEncoding)) {
                in = new QuotedPrintableInputStream(in);
            } else if ("base64".equalsIgnoreCase(contentTransferEncoding)) {
                in = new Base64InputStream(in);
            }
        }
        BinaryTempFileBody tempBody = new BinaryTempFileBody();
        OutputStream out = tempBody.getOutputStream();
        IOUtils.copy(in, out);
        out.close();
        return tempBody;
    }