symflower / eval-dev-quality

DevQualityEval: An evaluation benchmark 📈 and framework to compare and evolve the quality of code generation of LLMs.
https://symflower.com/en/company/blog/2024/dev-quality-eval-v0.4.0-is-llama-3-better-than-gpt-4-for-generating-tests/
MIT License
128 stars 5 forks source link

Task for code transpilation, so models can transpile Go code to Java and back #226

Closed ruiAzevedo19 closed 2 months ago

ruiAzevedo19 commented 3 months ago

Part of #201

bauersimon commented 2 months ago

TODO merge https://github.com/symflower/eval-dev-quality/pull/248 after this

bauersimon commented 2 months ago

We have a case where models can achieve different coverage depending on what code they produce:

Gemini Flash 1.5

package com.eval;

/** Provides a check for whether an int array is in non-decreasing order. */
class IsSorted {
    /**
     * Reports whether {@code a} is sorted in non-decreasing order.
     *
     * @param a the array to check
     * @return {@code true} if every element is less than or equal to its successor;
     *         {@code false} otherwise, including for an empty array (where
     *         {@code a.length - 1} is -1 and can never equal the index 0)
     */
    static boolean isSorted(int[] a) {
        int i = 0;
        // The loop body is intentionally empty: the header alone advances i
        // for as long as the adjacent pair (a[i], a[i + 1]) is in order.
        for (; i < a.length - 1 && a[i] <= a[i + 1]; i++) {
        }

        // i reaches the last index only if no out-of-order pair stopped the scan.
        return i == a.length - 1;
    }
}

GPT-4o

package com.eval;

/** Provides a check for whether an int array is in non-decreasing order. */
class IsSorted {
    /**
     * Reports whether {@code a} is sorted in non-decreasing order.
     *
     * @param a the array to check
     * @return {@code true} if every element is less than or equal to its successor;
     *         {@code false} otherwise, including for an empty array (where
     *         {@code a.length - 1} is -1 and can never equal the index 0)
     */
    static boolean isSorted(int[] a) {
        int i = 0;
        // Advance past every adjacent pair (a[i], a[i + 1]) that is in order.
        while (i < a.length - 1 && a[i] <= a[i + 1]) {
            i++;
        }
        // i reaches the last index only if no out-of-order pair stopped the scan.
        return i == a.length - 1;
    }
}

OR

GPT-4o

package balancedBrackets

// hasBalancedBrackets reports whether charArray consists solely of '[' and
// ']' characters that form a balanced sequence: no prefix contains more
// closing than opening brackets, and the totals match at the end. Any other
// character makes the result false; the empty string yields true.
func hasBalancedBrackets(charArray string) bool {
    // brackets counts the currently open, not yet closed '[' characters.
    brackets := 0
    for _, ch := range charArray {
        if ch == '[' {
            brackets++
        } else if ch == ']' {
            brackets--
        } else {
            return false // Non-bracket characters.
        }
        if brackets < 0 { // Closing bracket before opening bracket.
            return false
        }
    }
    // Balanced only if every opening bracket was closed.
    return brackets == 0
}

Sonnet 3.5

package balancedBrackets

// hasBalancedBrackets reports whether charArray consists solely of '[' and
// ']' characters that form a balanced sequence: no prefix contains more
// closing than opening brackets, and the totals match at the end. Any other
// character makes the result false; the empty string yields true.
func hasBalancedBrackets(charArray string) bool {
    // brackets counts the currently open, not yet closed '[' characters.
    brackets := 0
    for _, ch := range charArray {
        switch ch {
        case '[':
            brackets++
        case ']':
            brackets--
        default:
            return false // Non-bracket characters.
        }
        if brackets < 0 { // Closing bracket before opening bracket.
            return false
        }
    }
    // Balanced only if every opening bracket was closed.
    return brackets == 0
}