Commit: fix get version
mlabonne committed Mar 28, 2024
1 parent a80cdf8 · commit e3a9999
Showing 3 changed files with 41 additions and 15 deletions.
9 changes: 4 additions & 5 deletions llm_autoeval/table.py
@@ -67,7 +67,9 @@ def make_table(result_dict, task):
values = []

for k, dic in sorted(result_dict["results"].items()):
version = result_dict["versions"][k]
# Correctly use get() to safely access the dictionary
version = result_dict["versions"].get(k, "N/A") # Use get() on the versions dictionary

percent = k == "squad2"
for m, v in dic.items():
if m.endswith("_stderr"):
@@ -93,10 +93,7 @@ def make_table(result_dict, task):
# If conversion fails, use the original string value
v_formatted = v

if isinstance(version, str):
values.append([k, version, m, v_formatted, "", ""])
else:
values.append([k, version, m, v_formatted, "", ""])
values.append([k, version, m, v_formatted, "", ""])

k = ""
version = ""
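For context, the two hunks above replace a direct result_dict["versions"][k] lookup with dict.get() plus an "N/A" fallback, and collapse an if/else whose branches appended identical rows. Below is a minimal sketch of the failure the fallback avoids, using an illustrative result dictionary (the keys are assumptions loosely modelled on lm-evaluation-harness output, not taken from the repo):

```python
# Illustrative only -- the shape of result_dict is assumed, not taken from the repo.
result_dict = {
    "results": {"eq_bench": {"eqbench": 71.2, "eqbench_stderr": 1.9}},
    "versions": {},  # no "eq_bench" entry, e.g. a task the harness did not version
}

for k, dic in sorted(result_dict["results"].items()):
    # Old code: result_dict["versions"][k] would raise KeyError("eq_bench") here.
    version = result_dict["versions"].get(k, "N/A")  # new code: falls back to "N/A"
    for m, v in dic.items():
        print(k, version, m, v)  # e.g. "eq_bench N/A eqbench 71.2"
```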
6 changes: 6 additions & 0 deletions main.py
@@ -83,13 +83,19 @@ def _make_lighteval_summary(directory: str, elapsed_time: float) -> str:
summary += final_table
return summary

def _make_eqbench_summary(directory: str, elapsed_time: float) -> str:
result_dict = _get_result_dict(directory)
summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n"
return summary

def main(directory: str, elapsed_time: float) -> None:
# Tasks
if BENCHMARK == "openllm" or BENCHMARK == "nous":
summary = _make_autoeval_summary(directory, elapsed_time)
elif BENCHMARK == "lighteval":
summary = _make_lighteval_summary(directory, elapsed_time)
elif BENCHMARK == "eq-bench":
summary = _make_eqbench_summary(directory, elapsed_time)
else:
raise NotImplementedError(
f"BENCHMARK should be 'openllm' or 'nous' (current value = {BENCHMARK})"
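As a quick illustration (not part of the repo) of what the new eq-bench branch produces, here is the heading _make_eqbench_summary returns for a placeholder model id; in main.py, MODEL_ID and BENCHMARK come from the environment, and the new function also calls _get_result_dict(directory) even though only the heading is used so far:

```python
# Placeholder values -- not implied by the diff.
MODEL_ID = "some-org/some-model-7B"
BENCHMARK = "eq-bench"

# Mirrors the string built by _make_eqbench_summary above.
summary = f"## {MODEL_ID.split('/')[-1]} - {BENCHMARK.capitalize()}\n\n"
print(summary)  # -> "## some-model-7B - Eq-bench"
```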
41 changes: 31 additions & 10 deletions runpod.sh
@@ -37,7 +37,7 @@ if [ "$BENCHMARK" == "nous" ]; then
pip install -e .

benchmark="agieval"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/4] =================="
python main.py \
--model hf-causal \
--model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -47,7 +47,7 @@ if [ "$BENCHMARK" == "nous" ]; then
--output_path ./${benchmark}.json

benchmark="gpt4all"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [2/4] =================="
python main.py \
--model hf-causal \
--model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -57,7 +57,7 @@ if [ "$BENCHMARK" == "nous" ]; then
--output_path ./${benchmark}.json

benchmark="truthfulqa"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [3/4] =================="
python main.py \
--model hf-causal \
--model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -67,7 +67,7 @@ if [ "$BENCHMARK" == "nous" ]; then
--output_path ./${benchmark}.json

benchmark="bigbench"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [4/4] =================="
python main.py \
--model hf-causal \
--model_args pretrained=$MODEL_ID,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -88,7 +88,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
pip install accelerate

benchmark="arc"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
accelerate launch -m lm_eval \
--model hf \
--model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -98,7 +98,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
--output_path ./${benchmark}.json

benchmark="hellaswag"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [2/6] =================="
accelerate launch -m lm_eval \
--model hf \
--model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -108,7 +108,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
--output_path ./${benchmark}.json

benchmark="mmlu"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [3/6] =================="
accelerate launch -m lm_eval \
--model hf \
--model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -119,7 +119,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
--output_path ./${benchmark}.json

benchmark="truthfulqa"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [4/6] =================="
accelerate launch -m lm_eval \
--model hf \
--model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -129,7 +129,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
--output_path ./${benchmark}.json

benchmark="winogrande"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [5/6] =================="
accelerate launch -m lm_eval \
--model hf \
--model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -139,7 +139,7 @@ elif [ "$BENCHMARK" == "openllm" ]; then
--output_path ./${benchmark}.json

benchmark="gsm8k"
echo "==================$(echo $benchmark | tr '[:lower:]' '[:upper:]')=================="
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [6/6] =================="
accelerate launch -m lm_eval \
--model hf \
--model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
@@ -186,6 +186,27 @@ elif [ "$BENCHMARK" == "lighteval" ]; then
end=$(date +%s)

python ../llm-autoeval/main.py ./evals/results $(($end-$start))

elif [ "$BENCHMARK" == "eq-bench" ]; then
git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
pip install accelerate

benchmark="eq-bench"
echo "================== $(echo $benchmark | tr '[:lower:]' '[:upper:]') [1/6] =================="
accelerate launch -m lm_eval \
--model hf \
--model_args pretrained=${MODEL_ID},dtype=auto,trust_remote_code=$TRUST_REMOTE_CODE \
--tasks eq_bench \
--num_fewshot 0 \
--batch_size auto \
--output_path ./${benchmark}.json

end=$(date +%s)

python ../llm-autoeval/main.py ./evals/results $(($end-$start))

else
echo "Error: Invalid BENCHMARK value. Please set BENCHMARK to 'nous', 'openllm', or 'lighteval'."
fi
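The new eq-bench branch above writes the harness output to ./eq-bench.json and then, like the lighteval branch, hands ./evals/results and the elapsed seconds to main.py. As a hedged sketch of how such a harness JSON ties back to the table.py change, assuming the file exposes top-level "results" and "versions" keys (typical of lm-evaluation-harness output, not verified here):

```python
# Sketch only: assumes the harness JSON has "results" and "versions" at the top level.
import json

with open("./eq-bench.json") as f:  # path taken from --output_path above
    data = json.load(f)

for task, metrics in sorted(data["results"].items()):
    version = data["versions"].get(task, "N/A")  # same safe lookup table.py now uses
    scores = {m: v for m, v in metrics.items() if not m.endswith("_stderr")}
    print(task, version, scores)
```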
