symflower / eval-dev-quality

DevQualityEval: An evaluation benchmark 📈 and framework to compare and evolve the quality of code generation of LLMs.
https://symflower.com/en/company/blog/2024/dev-quality-eval-v0.4.0-is-llama-3-better-than-gpt-4-for-generating-tests/
MIT License
125 stars 5 forks source link

Data visualization based on evaluation CSV files #296

Open ruiAzevedo19 opened 1 month ago

ruiAzevedo19 commented 1 month ago

Goal: create an HTML report with graphs for data visualization. Tool: D3.js library for data visualization graphs.

TODO

bauersimon commented 1 month ago

Leaving this here until we have the summing logic in the visualization.

# script.sh <evaluation-without-extension> <meta-without-extension>
#
# Prepares the two input CSV files for the csvsql queries that follow.
#   $1  path of the evaluation CSV, without the ".csv" extension
#   $2  path of the meta CSV, without the ".csv" extension
# Both files are modified in place.

# Abort with a usage hint instead of silently operating on a file named ".csv".
: "${1:?usage: script.sh <evaluation-without-extension> <meta-without-extension>}"
: "${2:?usage: script.sh <evaluation-without-extension> <meta-without-extension>}"

# Install csvkit (the package that ships csvsql) only when csvsql is missing.
command -v csvsql >/dev/null 2>&1 || pip install csvkit

# SQL does not like hyphens in column names, so rewrite them to underscores.
# The "1" address limits the substitution to the header row; data rows
# (e.g. hyphenated task names) are left untouched.
# The arguments are quoted so that paths containing spaces are not word-split.
sed -i '1s/-/_/g' "$1.csv"
sed -i '1s/-/_/g' "$2.csv"

# Per (model_id, language): sum every metric over all rows whose task does not
# end in "-symflower-fix". The result is written to "$1-by-language.csv".
#
# --tables pins the SQL table name to "evaluation" so the query no longer
# depends on the file name passed as $1 (a directory prefix or a hyphen in $1
# would otherwise end up unquoted in the FROM clause and break the statement).
# The shell expansions are quoted so paths with spaces are not word-split.
csvsql --tables evaluation --query "
    SELECT
        model_id,
        language,
        SUM(score) AS score,
        SUM(coverage) AS coverage,
        SUM(files_executed) AS files_executed,
        SUM(files_executed_maximum_reachable) AS files_executed_maximum_reachable,
        SUM(generate_tests_for_file_character_count) AS generate_tests_for_file_character_count,
        SUM(processing_time) AS processing_time,
        SUM(response_character_count) AS response_character_count,
        SUM(response_no_error) AS response_no_error,
        SUM(response_no_excess) AS response_no_excess,
        SUM(response_with_code) AS response_with_code,
        SUM(tests_passing) AS tests_passing
    FROM evaluation
    WHERE task NOT LIKE '%-symflower-fix'
    GROUP BY
        model_id,
        language
" "$1.csv" > "$1-by-language.csv"

# Per model_id: the total score plus one score column for each of the
# languages "golang", "java" and "ruby", ignoring rows whose task ends in
# "-symflower-fix". The result is written to "$1-by-language-score.csv".
# Rows of any other language still count towards the total "score" column
# but get no column of their own.
#
# --tables pins the SQL table name to "evaluation" so the query no longer
# depends on the file name passed as $1 (a directory prefix or a hyphen in $1
# would otherwise end up unquoted in the FROM clause and break the statement).
# The shell expansions are quoted so paths with spaces are not word-split.
csvsql --tables evaluation --query "
    SELECT
        model_id,
        SUM(score) AS score,
        SUM(CASE WHEN language = 'golang' THEN score ELSE 0 END) AS golang_score,
        SUM(CASE WHEN language = 'java' THEN score ELSE 0 END) AS java_score,
        SUM(CASE WHEN language = 'ruby' THEN score ELSE 0 END) AS ruby_score
    FROM evaluation
    WHERE task NOT LIKE '%-symflower-fix'
    GROUP BY model_id
" "$1.csv" > "$1-by-language-score.csv"

# Per model_id: sum every metric over all rows whose task does not end in
# "-symflower-fix", and attach model_name and cost from the joined meta CSV.
# The result is written to "$1-total.csv".
#
# --tables names the two inputs "evaluation" and "meta" (in the order the
# files are given) so the query no longer depends on the file names passed as
# $1 / $2; a directory prefix or a hyphen in either argument would otherwise
# end up unquoted in the SQL and break the statement.
# The shell expansions are quoted so paths with spaces are not word-split.
#
# model_name and the cost expression are listed in GROUP BY because they are
# selected without an aggregate; the original relied on the engine picking an
# arbitrary row for them.
# NOTE(review): this assumes the meta CSV has one row per model_id and that
# completion / prompt / request are its pricing columns - confirm.
csvsql --tables evaluation,meta --query "
    SELECT
        evaluation.model_id AS model_id,
        model_name,
        (completion + prompt + request) AS cost,
        SUM(score) AS score,
        SUM(coverage) AS coverage,
        SUM(files_executed) AS files_executed,
        SUM(files_executed_maximum_reachable) AS files_executed_maximum_reachable,
        SUM(generate_tests_for_file_character_count) AS generate_tests_for_file_character_count,
        SUM(processing_time) AS processing_time,
        SUM(response_character_count) AS response_character_count,
        SUM(response_no_error) AS response_no_error,
        SUM(response_no_excess) AS response_no_excess,
        SUM(response_with_code) AS response_with_code,
        SUM(tests_passing) AS tests_passing
    FROM evaluation
    LEFT JOIN meta
        ON evaluation.model_id = meta.model_id
    WHERE task NOT LIKE '%-symflower-fix'
    GROUP BY
        evaluation.model_id,
        model_name,
        (completion + prompt + request)
" "$1.csv" "$2.csv" > "$1-total.csv"

# Per (model_id, task): sum every metric over all rows whose task does not
# end in "-symflower-fix". The result is written to "$1-by-task.csv".
#
# --tables pins the SQL table name to "evaluation" so the query no longer
# depends on the file name passed as $1 (a directory prefix or a hyphen in $1
# would otherwise end up unquoted in the FROM clause and break the statement).
# The shell expansions are quoted so paths with spaces are not word-split.
csvsql --tables evaluation --query "
    SELECT
        model_id,
        task,
        SUM(score) AS score,
        SUM(coverage) AS coverage,
        SUM(files_executed) AS files_executed,
        SUM(files_executed_maximum_reachable) AS files_executed_maximum_reachable,
        SUM(generate_tests_for_file_character_count) AS generate_tests_for_file_character_count,
        SUM(processing_time) AS processing_time,
        SUM(response_character_count) AS response_character_count,
        SUM(response_no_error) AS response_no_error,
        SUM(response_no_excess) AS response_no_excess,
        SUM(response_with_code) AS response_with_code,
        SUM(tests_passing) AS tests_passing
    FROM evaluation
    WHERE task NOT LIKE '%-symflower-fix'
    GROUP BY
        model_id,
        task
" "$1.csv" > "$1-by-task.csv"

# Per (model_id, task, language): sum every metric over all rows whose task
# does not end in "-symflower-fix". The result is written to
# "$1-by-task-by-language.csv".
#
# --tables pins the SQL table name to "evaluation" so the query no longer
# depends on the file name passed as $1 (a directory prefix or a hyphen in $1
# would otherwise end up unquoted in the FROM clause and break the statement).
# The shell expansions are quoted so paths with spaces are not word-split.
csvsql --tables evaluation --query "
    SELECT
        model_id,
        task,
        language,
        SUM(score) AS score,
        SUM(coverage) AS coverage,
        SUM(files_executed) AS files_executed,
        SUM(files_executed_maximum_reachable) AS files_executed_maximum_reachable,
        SUM(generate_tests_for_file_character_count) AS generate_tests_for_file_character_count,
        SUM(processing_time) AS processing_time,
        SUM(response_character_count) AS response_character_count,
        SUM(response_no_error) AS response_no_error,
        SUM(response_no_excess) AS response_no_excess,
        SUM(response_with_code) AS response_with_code,
        SUM(tests_passing) AS tests_passing
    FROM evaluation
    WHERE task NOT LIKE '%-symflower-fix'
    GROUP BY
        model_id,
        task,
        language
" "$1.csv" > "$1-by-task-by-language.csv"

# Per model_id, restricted to language "golang" and to tasks starting with
# "transpile" or "write-tests": split score and files_executed into the rows
# whose task ends in "-symflower-fix" (the *_fix columns) and the rows whose
# task does not (the plain columns). The result is written to
# "$1-by-symflower-fix.csv".
#
# --tables pins the SQL table name to "evaluation" so the query no longer
# depends on the file name passed as $1 (a directory prefix or a hyphen in $1
# would otherwise end up unquoted in the FROM clause and break the statement).
# The shell expansions are quoted so paths with spaces are not word-split.
csvsql --tables evaluation --query "
    SELECT
        model_id,
        SUM(CASE WHEN task NOT LIKE '%-symflower-fix' THEN score ELSE 0 END) AS score,
        SUM(CASE WHEN task LIKE '%-symflower-fix' THEN score ELSE 0 END) AS score_fix,
        SUM(CASE WHEN task NOT LIKE '%-symflower-fix' THEN files_executed ELSE 0 END) AS files_executed,
        SUM(CASE WHEN task LIKE '%-symflower-fix' THEN files_executed ELSE 0 END) AS files_executed_fix
    FROM evaluation
    WHERE (task LIKE 'transpile%' OR task LIKE 'write-tests%')
        AND language = 'golang'
    GROUP BY model_id
" "$1.csv" > "$1-by-symflower-fix.csv"