Open arnocandel opened 1 year ago
mkdir ollmer
cd ollmer/
git clone https://github.com/ollmer/lm-evaluation-harness
cd lm-evaluation-harness/
git checkout mmlu_fix
diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py
index 233b298..3a55b3d 100644
--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -73,10 +73,10 @@ class HuggingFaceAutoLM(BaseLM):
subfolder: Optional[str] = None,
revision: Optional[str] = "main",
batch_size: Optional[Union[int, str]] = 1,
- max_gen_toks: Optional[int] = 256,
+ max_gen_toks: Optional[int] = 512,
max_length: Optional[int] = None,
add_special_tokens: Optional[bool] = None,
- use_accelerate: Optional[bool] = False,
+ use_accelerate: Optional[bool] = True,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
@@ -85,7 +85,7 @@ class HuggingFaceAutoLM(BaseLM):
device: Optional[Union[int, str]] = "cuda",
peft: str = None,
load_in_8bit: Optional[bool] = False,
- trust_remote_code: Optional[bool] = False,
+ trust_remote_code: Optional[bool] = True,
):
"""Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
Args:
CUDA_VISIBLE_DEVICES=6,7 python main.py --model hf-causal-experimental --model_args pretrained=h2oai/h2ogpt-oasst1-falcon-40b --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda &> 
h2ogpt-oasst1-falcon-40b.16bit.eval.hendrycks.log
h2ogpt-oasst1-falcon-40b.16bit.eval.hendrycks.log
hf-causal-experimental (pretrained=h2oai/h2ogpt-oasst1-falcon-40b), limit: None, provide_description: False, num_fewshot: 0, batch_size: None
| Task |Version| Metric |Value | |Stderr|
|-------------------------------------------------|------:|--------|-----:|---|-----:|
|hendrycksTest-abstract_algebra | 1|acc |0.3000|± |0.0461|
| | |acc_norm|0.3000|± |0.0461|
|hendrycksTest-anatomy | 1|acc |0.5481|± |0.0430|
| | |acc_norm|0.5481|± |0.0430|
|hendrycksTest-astronomy | 1|acc |0.5395|± |0.0406|
| | |acc_norm|0.5395|± |0.0406|
|hendrycksTest-business_ethics | 1|acc |0.5500|± |0.0500|
| | |acc_norm|0.5500|± |0.0500|
|hendrycksTest-clinical_knowledge | 1|acc |0.5623|± |0.0305|
| | |acc_norm|0.5623|± |0.0305|
|hendrycksTest-college_biology | 1|acc |0.5903|± |0.0411|
| | |acc_norm|0.5903|± |0.0411|
|hendrycksTest-college_chemistry | 1|acc |0.2900|± |0.0456|
| | |acc_norm|0.2900|± |0.0456|
|hendrycksTest-college_computer_science | 1|acc |0.3800|± |0.0488|
| | |acc_norm|0.3800|± |0.0488|
|hendrycksTest-college_mathematics | 1|acc |0.3400|± |0.0476|
| | |acc_norm|0.3400|± |0.0476|
|hendrycksTest-college_medicine | 1|acc |0.4798|± |0.0381|
| | |acc_norm|0.4798|± |0.0381|
|hendrycksTest-college_physics | 1|acc |0.2647|± |0.0439|
| | |acc_norm|0.2647|± |0.0439|
|hendrycksTest-computer_security | 1|acc |0.6500|± |0.0479|
| | |acc_norm|0.6500|± |0.0479|
|hendrycksTest-conceptual_physics | 1|acc |0.4340|± |0.0324|
| | |acc_norm|0.4340|± |0.0324|
|hendrycksTest-econometrics | 1|acc |0.2982|± |0.0430|
| | |acc_norm|0.2982|± |0.0430|
|hendrycksTest-electrical_engineering | 1|acc |0.4552|± |0.0415|
| | |acc_norm|0.4552|± |0.0415|
|hendrycksTest-elementary_mathematics | 1|acc |0.3519|± |0.0246|
| | |acc_norm|0.3519|± |0.0246|
|hendrycksTest-formal_logic | 1|acc |0.3095|± |0.0413|
| | |acc_norm|0.3095|± |0.0413|
|hendrycksTest-global_facts | 1|acc |0.3600|± |0.0482|
| | |acc_norm|0.3600|± |0.0482|
|hendrycksTest-high_school_biology | 1|acc |0.6613|± |0.0269|
| | |acc_norm|0.6613|± |0.0269|
|hendrycksTest-high_school_chemistry | 1|acc |0.4778|± |0.0351|
| | |acc_norm|0.4778|± |0.0351|
|hendrycksTest-high_school_computer_science | 1|acc |0.5200|± |0.0502|
| | |acc_norm|0.5200|± |0.0502|
|hendrycksTest-high_school_european_history | 1|acc |0.6485|± |0.0373|
| | |acc_norm|0.6485|± |0.0373|
|hendrycksTest-high_school_geography | 1|acc |0.6212|± |0.0346|
| | |acc_norm|0.6212|± |0.0346|
|hendrycksTest-high_school_government_and_politics| 1|acc |0.7927|± |0.0293|
| | |acc_norm|0.7927|± |0.0293|
|hendrycksTest-high_school_macroeconomics | 1|acc |0.5000|± |0.0254|
| | |acc_norm|0.5000|± |0.0254|
|hendrycksTest-high_school_mathematics | 1|acc |0.2667|± |0.0270|
| | |acc_norm|0.2667|± |0.0270|
|hendrycksTest-high_school_microeconomics | 1|acc |0.5210|± |0.0324|
| | |acc_norm|0.5210|± |0.0324|
|hendrycksTest-high_school_physics | 1|acc |0.3444|± |0.0388|
| | |acc_norm|0.3444|± |0.0388|
|hendrycksTest-high_school_psychology | 1|acc |0.7138|± |0.0194|
| | |acc_norm|0.7138|± |0.0194|
|hendrycksTest-high_school_statistics | 1|acc |0.3750|± |0.0330|
| | |acc_norm|0.3750|± |0.0330|
|hendrycksTest-high_school_us_history | 1|acc |0.6863|± |0.0326|
| | |acc_norm|0.6863|± |0.0326|
|hendrycksTest-high_school_world_history | 1|acc |0.7089|± |0.0296|
| | |acc_norm|0.7089|± |0.0296|
|hendrycksTest-human_aging | 1|acc |0.6278|± |0.0324|
| | |acc_norm|0.6278|± |0.0324|
|hendrycksTest-human_sexuality | 1|acc |0.6947|± |0.0404|
| | |acc_norm|0.6947|± |0.0404|
|hendrycksTest-international_law | 1|acc |0.6942|± |0.0421|
| | |acc_norm|0.6942|± |0.0421|
|hendrycksTest-jurisprudence | 1|acc |0.6852|± |0.0449|
| | |acc_norm|0.6852|± |0.0449|
|hendrycksTest-logical_fallacies | 1|acc |0.6933|± |0.0362|
| | |acc_norm|0.6933|± |0.0362|
|hendrycksTest-machine_learning | 1|acc |0.2946|± |0.0433|
| | |acc_norm|0.2946|± |0.0433|
|hendrycksTest-management | 1|acc |0.7573|± |0.0425|
| | |acc_norm|0.7573|± |0.0425|
|hendrycksTest-marketing | 1|acc |0.7863|± |0.0269|
| | |acc_norm|0.7863|± |0.0269|
|hendrycksTest-medical_genetics | 1|acc |0.6100|± |0.0490|
| | |acc_norm|0.6100|± |0.0490|
|hendrycksTest-miscellaneous | 1|acc |0.7548|± |0.0154|
| | |acc_norm|0.7548|± |0.0154|
|hendrycksTest-moral_disputes | 1|acc |0.6301|± |0.0260|
| | |acc_norm|0.6301|± |0.0260|
|hendrycksTest-moral_scenarios | 1|acc |0.2726|± |0.0149|
| | |acc_norm|0.2726|± |0.0149|
|hendrycksTest-nutrition | 1|acc |0.5948|± |0.0281|
| | |acc_norm|0.5948|± |0.0281|
|hendrycksTest-philosophy | 1|acc |0.6559|± |0.0270|
| | |acc_norm|0.6559|± |0.0270|
|hendrycksTest-prehistory | 1|acc |0.6574|± |0.0264|
| | |acc_norm|0.6574|± |0.0264|
|hendrycksTest-professional_accounting | 1|acc |0.4078|± |0.0293|
| | |acc_norm|0.4078|± |0.0293|
|hendrycksTest-professional_law | 1|acc |0.4166|± |0.0126|
| | |acc_norm|0.4166|± |0.0126|
|hendrycksTest-professional_medicine | 1|acc |0.5368|± |0.0303|
| | |acc_norm|0.5368|± |0.0303|
|hendrycksTest-professional_psychology | 1|acc |0.5343|± |0.0202|
| | |acc_norm|0.5343|± |0.0202|
|hendrycksTest-public_relations | 1|acc |0.6000|± |0.0469|
| | |acc_norm|0.6000|± |0.0469|
|hendrycksTest-security_studies | 1|acc |0.6327|± |0.0309|
| | |acc_norm|0.6327|± |0.0309|
|hendrycksTest-sociology | 1|acc |0.7363|± |0.0312|
| | |acc_norm|0.7363|± |0.0312|
|hendrycksTest-us_foreign_policy | 1|acc |0.8200|± |0.0386|
| | |acc_norm|0.8200|± |0.0386|
|hendrycksTest-virology | 1|acc |0.4518|± |0.0387|
| | |acc_norm|0.4518|± |0.0387|
|hendrycksTest-world_religions | 1|acc |0.7836|± |0.0316|
| | |acc_norm|0.7836|± |0.0316|
(env) arno@rippa:/nfs4/llm/h2ogpt(main)$ cat h2ogpt-oasst1-falcon-40b.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.541579
CUDA_VISIBLE_DEVICES=4,5 python main.py --model hf-causal-experimental --model_args pretrained=h2oai/h2ogpt-oig-oasst1-falcon-40b --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda &> 
h2ogpt-oig-oasst1-falcon-40b.16bit.eval.hendrycks.log
h2ogpt-oig-oasst1-falcon-40b.16bit.eval.hendrycks.log
cat h2ogpt-oig-oasst1-falcon-40b.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.526419
This is in line with other evals showing that the oig-oasst1 model is worse than the oasst1 model: https://github.com/h2oai/h2ogpt/issues/216#issuecomment-1580948272
List of MMLU tasks with their tested concepts and supercategories, from page 15 of https://arxiv.org/pdf/2009.03300.pdf:
Task Tested Concepts Supercategory
Abstract Algebra Groups, rings, fields, vector spaces, ... STEM
Anatomy Central nervous system, circulatory system, ... STEM
Astronomy Solar system, galaxies, asteroids, ... STEM
Business Ethics Corporate responsibility, stakeholders, regulation, ... Other
Clinical Knowledge Spot diagnosis, joints, abdominal examination, ... Other
College Biology Cellular structure, molecular biology, ecology, ... STEM
College Chemistry Analytical, organic, inorganic, physical, ... STEM
College Computer Science Algorithms, systems, graphs, recursion, ... STEM
College Mathematics Differential equations, real analysis, combinatorics, ... STEM
College Medicine Introductory biochemistry, sociology, reasoning, ... Other
College Physics Electromagnetism, thermodynamics, special relativity, ... STEM
Computer Security Cryptography, malware, side channels, fuzzing, ... STEM
Conceptual Physics Newton’s laws, rotational motion, gravity, sound, ... STEM
Econometrics Volatility, long-run relationships, forecasting, ... Social Sciences
Electrical Engineering Circuits, power systems, electrical drives, ... STEM
Elementary Mathematics Word problems, multiplication, remainders, rounding, ... STEM
Formal Logic Propositions, predicate logic, first-order logic, ... Humanities
Global Facts Extreme poverty, literacy rates, life expectancy, ... Other
High School Biology Natural selection, heredity, cell cycle, Krebs cycle, ... STEM
High School Chemistry Chemical reactions, ions, acids and bases, ... STEM
High School Computer Science Arrays, conditionals, iteration, inheritance, ... STEM
High School European History Renaissance, reformation, industrialization, ... Humanities
High School Geography Population migration, rural land-use, urban processes, ... Social Sciences
High School Gov’t and Politics Branches of government, civil liberties, political ideologies, ... Social Sciences
High School Macroeconomics Economic indicators, national income, international trade, ... Social Sciences
High School Mathematics Pre-algebra, algebra, trigonometry, calculus, ... STEM
High School Microeconomics Supply and demand, imperfect competition, market failure, ... Social Sciences
High School Physics Kinematics, energy, torque, fluid pressure, ... STEM
High School Psychology Behavior, personality, emotions, learning, ... Social Sciences
High School Statistics Random variables, sampling distributions, chi-square tests, ... STEM
High School US History Civil War, the Great Depression, The Great Society, ... Humanities
High School World History Ottoman empire, economic imperialism, World War I, ... Humanities
Human Aging Senescence, dementia, longevity, personality changes, ... Other
Human Sexuality Pregnancy, sexual differentiation, sexual orientation, ... Social Sciences
International Law Human rights, sovereignty, law of the sea, use of force, ... Humanities
Jurisprudence Natural law, classical legal positivism, legal realism, ... Humanities
Logical Fallacies No true Scotsman, base rate fallacy, composition fallacy, ... Humanities
Machine Learning SVMs, VC dimension, deep learning architectures, ... STEM
Management Organizing, communication, organizational structure, ... Other
Marketing Segmentation, pricing, market research, ... Other
Medical Genetics Genes and cancer, common chromosome disorders, ... Other
Miscellaneous Agriculture, Fermi estimation, pop culture, ... Other
Moral Disputes Freedom of speech, addiction, the death penalty, ... Humanities
Moral Scenarios Detecting physical violence, stealing, externalities, ... Humanities
Nutrition Metabolism, water-soluble vitamins, diabetes, ... Other
Philosophy Skepticism, phronesis, skepticism, Singer’s Drowning Child, ... Humanities
Prehistory Neanderthals, Mesoamerica, extinction, stone tools, ... Humanities
Professional Accounting Auditing, reporting, regulation, valuation, ... Other
Professional Law Torts, criminal law, contracts, property, evidence, ... Humanities
Professional Medicine Diagnosis, pharmacotherapy, disease prevention, ... Other
Professional Psychology Diagnosis, biology and behavior, lifespan development, ... Social Sciences
Public Relations Media theory, crisis management, intelligence gathering, ... Social Sciences
Security Studies Environmental security, terrorism, weapons of mass destruction, ... Social Sciences
Sociology Socialization, cities and community, inequality and wealth, ... Social Sciences
US Foreign Policy Soft power, Cold War foreign policy, isolationism, ... Social Sciences
Virology Epidemiology, coronaviruses, retroviruses, herpesviruses, ... Other
World Religions Judaism, Christianity, Islam, Buddhism, Jainism, ... Humanities
CUDA_VISIBLE_DEVICES=6,7 python main.py --model hf-causal-experimental --model_args pretrained=h2oai/h2ogpt-oasst1-512-12b --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda &> 
h2ogpt-oasst1-512-12b.16bit.eval.hendrycks.log
h2ogpt-oasst1-512-12b.16bit.eval.hendrycks.log
(env) arno@rippa:/nfs4/llm/h2ogpt(main)$ cat h2ogpt-oasst1-512-12b.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.253923
CUDA_VISIBLE_DEVICES=4,5 python main.py --model hf-causal-experimental --model_args pretrained=h2oai/h2ogpt-oasst1-512-20b --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda &> 
h2ogpt-oasst1-512-20b.16bit.eval.hendrycks.log
h2ogpt-oasst1-512-20b.16bit.eval.hendrycks.log
(env) arno@rippa:/nfs4/llm/h2ogpt(main)$ cat h2ogpt-oasst1-512-20b.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.261353
CUDA_VISIBLE_DEVICES=4,5 python main.py --model hf-causal-experimental --model_args pretrained=tiiuae/falcon-40b --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda &> 
falcon40.16bit.eval.hendrycks.log
falcon40.16bit.eval.hendrycks.log
(env) arno@rippa:/nfs4/llm/h2ogpt(main)$ cat falcon40.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.541049
CUDA_VISIBLE_DEVICES=4,5 python main.py --model hf-causal-experimental --model_args pretrained=h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2 --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda 
&> h2ogpt-gm-oasst1-en-2048-falcon-7b-v2.16bit.eval.hendrycks.log
h2ogpt-gm-oasst1-en-2048-falcon-7b-v2.16bit.eval.hendrycks.log
(env) arno@rippa:/nfs4/llm/h2ogpt(main)$ cat h2ogpt-gm-oasst1-en-2048-falcon-7b-v2.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.255867
CUDA_VISIBLE_DEVICES=4,5 python main.py --model hf-causal-experimental --model_args pretrained=tiiuae/falcon-7b --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda &> 
falcon7.16bit.eval.hendrycks.log
falcon7.16bit.eval.hendrycks.log
(env) arno@rippa:/nfs4/llm/h2ogpt(main)$ cat falcon7.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.252444
CUDA_VISIBLE_DEVICES=6,7 python main.py --model hf-causal-experimental --model_args pretrained=h2oai/h2ogpt-gm-oasst1-en-2048-falcon-40b-v1 --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --device cuda 
&> h2ogpt-gm-oasst1-en-2048-falcon-40b-v1.16bit.eval.hendrycks.log
h2ogpt-gm-oasst1-en-2048-falcon-40b-v1.16bit.eval.hendrycks.log
(env) arno@rippa:/nfs4/llm/h2ogpt(main)$ cat h2ogpt-gm-oasst1-en-2048-falcon-40b-v1.16bit.eval.hendrycks.log | grep acc_norm |grep \| |sed 's/|/ /g' | awk '{print $2}' | awk '{x+=$0}END{print x/NR}'
0.537337
Model | MMLU Avg |
---|---|
h2ogpt-oasst1-falcon-40b | 0.542 |
tiiuae/falcon-40b | 0.541 |
h2ogpt-gm-oasst1-en-2048-falcon-40b-v1 | 0.537 |
h2ogpt-oig-oasst1-falcon-40b | 0.526 |
h2ogpt-oasst1-512-20b | 0.261 |
h2ogpt-gm-oasst1-en-2048-falcon-7b-v2 | 0.256 |
h2ogpt-oasst1-512-12b | 0.254 |
tiiuae/falcon-7b | 0.252 |
https://github.com/hendrycks/test/pull/13 https://github.com/EleutherAI/lm-evaluation-harness/pull/497
Want to add Falcon 40B here: