Merge pull request #26 from mlabonne/eqbench
Eqbench
mlabonne authored Apr 4, 2024
2 parents f396921 + ba55b76 commit 7f00053
Showing 3 changed files with 11 additions and 13 deletions.
4 changes: 4 additions & 0 deletions llm_autoeval/table.py
@@ -52,6 +52,10 @@ def calculate_average(data, task):
         elif task == "truthfulqa":
             value = data["results"]["truthfulqa_mc"]["mc2"]
             return 0.0 if math.isnan(value) else value * 100
+
+    elif BENCHMARK == "eq-bench":
+        if task == "eq-bench":
+            return data["results"]["eq_bench"]["eqbench,none"]
 
     raise NotImplementedError(f"Could not find task {task} for benchmark {BENCHMARK}")

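For context, a minimal sketch of how the new branch is exercised once lm-evaluation-harness has written its results JSON. The standalone scaffolding, the hard-coded BENCHMARK value, and the file path are assumptions for illustration; only the dictionary lookup mirrors the committed code.

```python
import json

BENCHMARK = "eq-bench"  # taken from the environment in the real project (assumption here)

def calculate_average(data, task):
    # Condensed to the branch added in this commit.
    if BENCHMARK == "eq-bench":
        if task == "eq-bench":
            # lm-evaluation-harness keys each metric as "<metric>,<filter>",
            # hence "eqbench,none" under the "eq_bench" task entry.
            return data["results"]["eq_bench"]["eqbench,none"]
    raise NotImplementedError(f"Could not find task {task} for benchmark {BENCHMARK}")

# Hypothetical results file, matching the output path set in runpod.sh below.
with open("evals/eq-bench.json") as f:
    data = json.load(f)

print(calculate_average(data, "eq-bench"))
```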
14 changes: 4 additions & 10 deletions main.py
@@ -25,9 +25,11 @@ def _make_autoeval_summary(directory: str, elapsed_time: float) -> str:
         tasks = ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K"]
     elif BENCHMARK == "nous":
         tasks = ["AGIEval", "GPT4All", "TruthfulQA", "Bigbench"]
+    elif BENCHMARK == "eq-bench":
+        tasks = ["EQ-Bench"]
     else:
         raise NotImplementedError(
-            f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
+            f"The benchmark {BENCHMARK} could not be found."
         )
 
     # Load results
@@ -84,20 +86,12 @@ def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
     return summary
 
 
-def _make_eqbench_summary(directory: str, elapsed_time: float) -> str:
-    result_dict = _get_result_dict(directory)
-    summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n"
-    return summary
-
-
 def main(directory: str, elapsed_time: float) -> None:
     # Tasks
-    if BENCHMARK == "openllm" or BENCHMARK == "nous":
+    if BENCHMARK == "openllm" or BENCHMARK == "nous" or BENCHMARK == "eq-bench":
         summary = _make_autoeval_summary(directory, elapsed_time)
     elif BENCHMARK == "lighteval":
         summary = _make_lighteval_summary(directory, elapsed_time)
-    elif BENCHMARK == "eq-bench":
-        summary = _make_eqbench_summary(directory, elapsed_time)
     else:
         raise NotImplementedError(
             f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
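To make the net effect of this file easier to see, here is a condensed sketch of the dispatch after the merge: the dedicated `_make_eqbench_summary` helper is gone and eq-bench now goes through the generic autoeval summary. The stubbed bodies, the hard-coded BENCHMARK, and the `in (...)` shorthand (the diff spells it out with chained `or`) are illustrative only.

```python
BENCHMARK = "eq-bench"  # set via the environment in the real script

def _make_autoeval_summary(directory: str, elapsed_time: float) -> str:
    if BENCHMARK == "openllm":
        tasks = ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K"]
    elif BENCHMARK == "nous":
        tasks = ["AGIEval", "GPT4All", "TruthfulQA", "Bigbench"]
    elif BENCHMARK == "eq-bench":
        tasks = ["EQ-Bench"]
    else:
        raise NotImplementedError(f"The benchmark {BENCHMARK} could not be found.")
    # The real function loads the results and builds a per-task table;
    # stubbed here to keep the sketch self-contained.
    return f"summary over {tasks}"

def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
    return "lighteval summary (unchanged by this commit, stubbed here)"

def main(directory: str, elapsed_time: float) -> None:
    if BENCHMARK in ("openllm", "nous", "eq-bench"):
        summary = _make_autoeval_summary(directory, elapsed_time)
    elif BENCHMARK == "lighteval":
        summary = _make_lighteval_summary(directory, elapsed_time)
    else:
        raise NotImplementedError(
            f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
        )
    print(summary)  # the real script goes on to publish the summary; printing is a stand-in

main("./evals", 0.0)
```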
6 changes: 3 additions & 3 deletions runpod.sh
@@ -194,18 +194,18 @@ elif [ "$BENCHMARK" == "eq-bench" ]; then
     pip install accelerate
 
     benchmark="eq-bench"
-    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
+    echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/1] =================="
     accelerate launch -m lm_eval \
         --model hf \
         --model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
         --tasks eq_bench \
         --num_fewshot 0 \
         --batch_size auto \
-        --output_path ./${benchmark}.json
+        --output_path ./evals/${benchmark}.json
 
     end=$(date +%s)
 
-    python ../llm-autoeval/main.py ./evals/results $(($end-$start))
+    python ../llm-autoeval/main.py ./evals $(($end-$start))
 
 else
     echo "Error: Invalid BENCHMARK value. Please set BENCHMARK to 'nous', 'openllm', or 'lighteval'."
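The two path changes in this script work together: lm_eval now writes its JSON under ./evals/, and main.py is pointed at ./evals instead of ./evals/results, so the summary step can actually find the output. A small sanity-check sketch under those assumptions (the check itself is illustrative, not part of the commit):

```python
from pathlib import Path

# Assumption: after this change the eq-bench run leaves its results somewhere
# under ./evals/ (e.g. ./evals/eq-bench.json), and main.py is handed ./evals.
evals_dir = Path("./evals")
results = sorted(evals_dir.rglob("*.json"))

if not results:
    raise SystemExit(
        "No result files under ./evals; before this fix the harness wrote to ./ "
        "while main.py searched ./evals/results."
    )
print(f"Found {len(results)} result file(s): {[p.name for p in results]}")
```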
