Open ruiAzevedo19 opened 3 months ago
Leaving this here until we have the summing logic in the visualization.
#!/usr/bin/env bash
# script.sh <evaluation-without-extension> <meta-without-extension>
#
# Sums the per-row metrics of <evaluation>.csv with csvsql and writes one
# summary CSV per grouping next to the input:
#   <evaluation>-by-language.csv          per model and language
#   <evaluation>-by-language-score.csv    per model, score split by language
#   <evaluation>-total.csv                per model, joined with <meta>.csv
#   <evaluation>-by-task.csv              per model and task
#   <evaluation>-by-task-by-language.csv  per model, task and language
#   <evaluation>-by-symflower-fix.csv     per model, plain vs. "-symflower-fix"
#
# NOTE: both input CSV files are modified in place (hyphens in the header row
# are replaced by underscores).
set -euo pipefail

if (( $# < 2 )); then
  printf 'Usage: %s <evaluation-without-extension> <meta-without-extension>\n' \
    "${0##*/}" >&2
  exit 2
fi

readonly evaluation=$1
readonly meta=$2

# Metric columns that are summed identically by several of the queries below.
readonly metric_sums="SUM(score) AS score, SUM(coverage) AS coverage, SUM(files_executed) AS files_executed, SUM(files_executed_maximum_reachable) AS files_executed_maximum_reachable, SUM(generate_tests_for_file_character_count) AS generate_tests_for_file_character_count, SUM(processing_time) AS processing_time, SUM(response_character_count) AS response_character_count, SUM(response_no_error) AS response_no_error, SUM(response_no_excess) AS response_no_excess, SUM(response_with_code) AS response_with_code, SUM(tests_passing) AS tests_passing"

# Filter that drops the rows of "...-symflower-fix" tasks.
readonly without_fix="task NOT LIKE '%-symflower-fix'"

#######################################
# Runs one SQL query against the evaluation CSV.
# The table is always named "evaluation" (via --tables) so that the SQL stays
# valid even when the file name contains a directory part or hyphens.
# Globals:   evaluation (read)
# Arguments: $1 - SQL query referring to the table "evaluation"
# Outputs:   query result as CSV on stdout
#######################################
query_evaluation() {
  csvsql --tables evaluation --query "$1" "${evaluation}.csv"
}

# Only install csvkit when csvsql is not already available.
command -v csvsql >/dev/null 2>&1 || pip install csvkit

# SQL does not like hyphens in column names. Only the header row (line 1) is
# rewritten; data rows keep their hyphens (e.g. in task names).
sed -i '1s/-/_/g' -- "${evaluation}.csv"
sed -i '1s/-/_/g' -- "${meta}.csv"

query_evaluation "SELECT model_id, language, ${metric_sums} FROM evaluation WHERE ${without_fix} GROUP BY model_id, language" \
  > "${evaluation}-by-language.csv"

query_evaluation "SELECT model_id, SUM(score) AS score, SUM(CASE WHEN language = 'golang' THEN score ELSE 0 END) AS golang_score, SUM(CASE WHEN language = 'java' THEN score ELSE 0 END) AS java_score, SUM(CASE WHEN language = 'ruby' THEN score ELSE 0 END) AS ruby_score FROM evaluation WHERE ${without_fix} GROUP BY model_id" \
  > "${evaluation}-by-language-score.csv"

# The totals additionally pull the model name and the cost components from the
# meta CSV; models without a meta row are kept because of the LEFT JOIN.
csvsql --tables evaluation,meta \
  --query "SELECT evaluation.model_id, model_name, (completion + prompt + request) AS cost, ${metric_sums} FROM evaluation LEFT JOIN meta ON evaluation.model_id = meta.model_id WHERE ${without_fix} GROUP BY evaluation.model_id" \
  "${evaluation}.csv" "${meta}.csv" \
  > "${evaluation}-total.csv"

query_evaluation "SELECT model_id, task, ${metric_sums} FROM evaluation WHERE ${without_fix} GROUP BY model_id, task" \
  > "${evaluation}-by-task.csv"

query_evaluation "SELECT model_id, task, language, ${metric_sums} FROM evaluation WHERE ${without_fix} GROUP BY model_id, task, language" \
  > "${evaluation}-by-task-by-language.csv"

# Compares plain tasks with their "-symflower-fix" variants, restricted to
# golang rows of the "transpile" and "write-tests" task families.
query_evaluation "SELECT model_id, SUM(CASE WHEN task NOT LIKE '%-symflower-fix' THEN score ELSE 0 END) AS score, SUM(CASE WHEN task LIKE '%-symflower-fix' THEN score ELSE 0 END) AS score_fix, SUM(CASE WHEN task NOT LIKE '%-symflower-fix' THEN files_executed ELSE 0 END) AS files_executed, SUM(CASE WHEN task LIKE '%-symflower-fix' THEN files_executed ELSE 0 END) AS files_executed_fix FROM evaluation WHERE (task LIKE 'transpile%' OR task LIKE 'write-tests%') AND language = 'golang' GROUP BY model_id" \
  > "${evaluation}-by-symflower-fix.csv"
Goal: create an HTML report with graphs for data visualization. Tool: D3.js library for data visualization graphs.
TODO
[x] Create a table for the evaluation CSV file
[ ] Scatter plot
MetaInformation
function that returns a model's meta information
[ ] error bars over multiple runs, to show variance