Holistic Evaluation of Language Models (HELM) is a framework to increase the transparency of language models (https://arxiv.org/abs/2211.09110). This framework is also used to evaluate text-to-image models in HEIM (https://arxiv.org/abs/2311.04287) and vision-language models in VHELM (https://arxiv.org/abs/2410.07112).
Error when running babi_qa:model=limited_functionality,task=3:
Traceback (most recent call last):
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/presentation/present.py", line 118, in run
new_run_specs = run_benchmarking(
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/run.py", line 69, in run_benchmarking
runner.run_all()
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/runner.py", line 75, in run_all
self.run_one(run_spec)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/runner.py", line 99, in run_one
scenario_state = self.executor.execute(scenario_state)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/common/hierarchical_logger.py", line 104, in wrapper
return fn(*args, **kwargs)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/executor.py", line 84, in execute
request_states = list(
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/site-packages/tqdm/std.py", line 1195, in __iter__
for obj in iterable:
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
yield fs.pop().result()
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/_base.py", line 444, in result
return self.__get_result()
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
raise self._exception
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/executor.py", line 77, in process
result: RequestResult = self.remote_service.make_request(self.execution_spec.auth, state.request)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/proxy/remote_service.py", line 47, in make_request
RemoteService._check_response(response)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/proxy/remote_service.py", line 29, in _check_response
raise RemoteServiceError(response["error"])
proxy.remote_service.RemoteServiceError: Failed to make request to anthropic after retrying 5 times. Error: Anthropic error: Server 'stanford-online-all-v4-s3' not found all attempts to connect to stanford-online-all-v4-s3.default.svc.cluster.local:5000 failed
Error when running news_qa:model=limited_functionality,data_augmentation=all:
Traceback (most recent call last):
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/presentation/present.py", line 118, in run
new_run_specs = run_benchmarking(
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/run.py", line 69, in run_benchmarking
runner.run_all()
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/runner.py", line 75, in run_all
self.run_one(run_spec)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/runner.py", line 99, in run_one
scenario_state = self.executor.execute(scenario_state)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/common/hierarchical_logger.py", line 104, in wrapper
return fn(*args, **kwargs)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/executor.py", line 84, in execute
request_states = list(
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/site-packages/tqdm/std.py", line 1195, in __iter__
for obj in iterable:
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
yield fs.pop().result()
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/_base.py", line 444, in result
return self.__get_result()
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
raise self._exception
File "/u/nlp/anaconda/main/anaconda3/envs/crfm_benchmarking/lib/python3.8/concurrent/futures/thread.py", line 57, in run
result = self.fn(*self.args, **self.kwargs)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/benchmark/executor.py", line 77, in process
result: RequestResult = self.remote_service.make_request(self.execution_spec.auth, state.request)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/proxy/remote_service.py", line 47, in make_request
RemoteService._check_response(response)
File "/juice/scr/nlp/crfm/benchmarking/benchmarking/src/proxy/remote_service.py", line 29, in _check_response
raise RemoteServiceError(response["error"])
proxy.remote_service.RemoteServiceError: Failed to make request to anthropic after retrying 5 times. Error: Anthropic error: Server 'stanford-online-all-v4-s3' not found all attempts to connect to stanford-online-all-v4-s3.default.svc.cluster.local:5000 failed