{"results": {"arc_challenge": {"acc,none": 0.604, "acc_stderr,none": 0.030993197854577898, "acc_norm,none": 0.604, "acc_norm_stderr,none": 0.030993197854577898}, "arc_easy": {"acc,none": 0.832, "acc_stderr,none": 0.023692813205492536, "acc_norm,none": 0.792, "acc_norm_stderr,none": 0.025721398901416368}, "boolq": {"acc,none": 0.892, "acc_stderr,none": 0.019669559381568776}, "hellaswag": {"acc,none": 0.58, "acc_stderr,none": 0.03127799950463661, "acc_norm,none": 0.756, "acc_norm_stderr,none": 0.02721799546455311}, "mmlu": {"acc,none": 0.8088854095312653, "acc_stderr,none": 0.003734034978051547}, "mmlu_humanities": {"acc,none": 0.7634577603143419, "acc_stderr,none": 0.00819165269421013}, "mmlu_formal_logic": {"acc,none": 0.6587301587301587, "acc_stderr,none": 0.04240799327574919}, "mmlu_high_school_european_history": {"acc,none": 0.8181818181818182, "acc_stderr,none": 0.030117688929503603}, "mmlu_high_school_us_history": {"acc,none": 0.8578431372549019, "acc_stderr,none": 0.024509803921568627}, "mmlu_high_school_world_history": {"acc,none": 0.7974683544303798, "acc_stderr,none": 0.02616056824660147}, "mmlu_international_law": {"acc,none": 0.8429752066115702, "acc_stderr,none": 0.03321244842547127}, "mmlu_jurisprudence": {"acc,none": 0.8796296296296297, "acc_stderr,none": 0.03145703854306253}, "mmlu_logical_fallacies": {"acc,none": 0.8834355828220859, "acc_stderr,none": 0.02521232721050706}, "mmlu_moral_disputes": {"acc,none": 0.816, "acc_stderr,none": 0.02455581299422255}, "mmlu_moral_scenarios": {"acc,none": 0.552, "acc_stderr,none": 0.03151438761115348}, "mmlu_philosophy": {"acc,none": 0.74, "acc_stderr,none": 0.027797315752644335}, "mmlu_prehistory": {"acc,none": 0.82, "acc_stderr,none": 0.02434689065029351}, "mmlu_professional_law": {"acc,none": 0.612, "acc_stderr,none": 0.030881038748993974}, "mmlu_world_religions": {"acc,none": 0.7894736842105263, "acc_stderr,none": 0.03126781714663182}, "mmlu_other": {"acc,none": 0.8105349122090649, "acc_stderr,none": 0.007621974748052918}, "mmlu_business_ethics": {"acc,none": 0.85, "acc_stderr,none": 0.03588702812826367}, "mmlu_clinical_knowledge": {"acc,none": 0.88, "acc_stderr,none": 0.020593600596839998}, "mmlu_college_medicine": {"acc,none": 0.8208092485549133, "acc_stderr,none": 0.0292425130590633}, "mmlu_global_facts": {"acc,none": 0.5, "acc_stderr,none": 0.050251890762960605}, "mmlu_human_aging": {"acc,none": 0.8026905829596412, "acc_stderr,none": 0.02670985334496796}, "mmlu_management": {"acc,none": 0.883495145631068, "acc_stderr,none": 0.03176683948640407}, "mmlu_marketing": {"acc,none": 0.8418803418803419, "acc_stderr,none": 0.02390232554956044}, "mmlu_medical_genetics": {"acc,none": 0.89, "acc_stderr,none": 0.031446603773522014}, "mmlu_miscellaneous": {"acc,none": 0.864, "acc_stderr,none": 0.021723342617052086}, "mmlu_nutrition": {"acc,none": 0.868, "acc_stderr,none": 0.021450980824038166}, "mmlu_professional_accounting": {"acc,none": 0.704, "acc_stderr,none": 0.028928939388379697}, "mmlu_professional_medicine": {"acc,none": 0.916, "acc_stderr,none": 0.017578738526776348}, "mmlu_virology": {"acc,none": 0.5662650602409639, "acc_stderr,none": 0.03858158940685519}, "mmlu_social_sciences": {"acc,none": 0.8837719298245614, "acc_stderr,none": 0.006503914671117776}, "mmlu_econometrics": {"acc,none": 0.6491228070175439, "acc_stderr,none": 0.044895393502707}, "mmlu_high_school_geography": {"acc,none": 0.9393939393939394, "acc_stderr,none": 0.016999994927421623}, "mmlu_high_school_government_and_politics": {"acc,none": 0.9740932642487047, "acc_stderr,none": 0.011464523356953155}, "mmlu_high_school_macroeconomics": {"acc,none": 0.892, "acc_stderr,none": 0.019669559381568776}, "mmlu_high_school_microeconomics": {"acc,none": 0.9411764705882353, "acc_stderr,none": 0.015283995352038376}, "mmlu_high_school_psychology": {"acc,none": 0.972, "acc_stderr,none": 0.010454721651927283}, "mmlu_human_sexuality": {"acc,none": 0.8549618320610687, "acc_stderr,none": 0.030884661089515424}, "mmlu_professional_psychology": {"acc,none": 0.86, "acc_stderr,none": 0.021989409645240245}, "mmlu_public_relations": {"acc,none": 0.7727272727272727, "acc_stderr,none": 0.040139645540727735}, "mmlu_security_studies": {"acc,none": 0.7836734693877551, "acc_stderr,none": 0.026358916334903996}, "mmlu_sociology": {"acc,none": 0.900497512437811, "acc_stderr,none": 0.02116621630465935}, "mmlu_us_foreign_policy": {"acc,none": 0.92, "acc_stderr,none": 0.027265992434429072}, "mmlu_stem": {"acc,none": 0.7887945670628184, "acc_stderr,none": 0.007228021241323262}, "mmlu_abstract_algebra": {"acc,none": 0.62, "acc_stderr,none": 0.04878317312145634}, "mmlu_anatomy": {"acc,none": 0.7481481481481481, "acc_stderr,none": 0.037498507091740185}, "mmlu_astronomy": {"acc,none": 0.9276315789473685, "acc_stderr,none": 0.02108501126188409}, "mmlu_college_biology": {"acc,none": 0.9375, "acc_stderr,none": 0.02024219611347799}, "mmlu_college_chemistry": {"acc,none": 0.6, "acc_stderr,none": 0.0492365963917331}, "mmlu_college_computer_science": {"acc,none": 0.74, "acc_stderr,none": 0.0440844002276808}, "mmlu_college_mathematics": {"acc,none": 0.71, "acc_stderr,none": 0.045604802157206865}, "mmlu_college_physics": {"acc,none": 0.6764705882352942, "acc_stderr,none": 0.04655010411319613}, "mmlu_computer_security": {"acc,none": 0.8, "acc_stderr,none": 0.04020151261036849}, "mmlu_conceptual_physics": {"acc,none": 0.9276595744680851, "acc_stderr,none": 0.01693467695630563}, "mmlu_electrical_engineering": {"acc,none": 0.8068965517241379, "acc_stderr,none": 0.03289445522127399}, "mmlu_elementary_mathematics": {"acc,none": 0.752, "acc_stderr,none": 0.027367497504863593}, "mmlu_high_school_biology": {"acc,none": 0.94, "acc_stderr,none": 0.015050117079158739}, "mmlu_high_school_chemistry": {"acc,none": 0.8029556650246306, "acc_stderr,none": 0.02798672466673623}, "mmlu_high_school_computer_science": {"acc,none": 0.85, "acc_stderr,none": 0.03588702812826367}, "mmlu_high_school_mathematics": {"acc,none": 0.552, "acc_stderr,none": 0.03151438761115348}, "mmlu_high_school_physics": {"acc,none": 0.7748344370860927, "acc_stderr,none": 0.03410435282008936}, "mmlu_high_school_statistics": {"acc,none": 0.8333333333333334, "acc_stderr,none": 0.025416428388767443}, "mmlu_machine_learning": {"acc,none": 0.7946428571428571, "acc_stderr,none": 0.038342410214190735}, "openbookqa": {"acc,none": 0.328, "acc_stderr,none": 0.029752391824475363, "acc_norm,none": 0.472, "acc_norm_stderr,none": 0.031636489531544396}, "rte": {"acc,none": 0.82, "acc_stderr,none": 0.02434689065029351}, "winogrande": {"acc,none": 0.768, "acc_stderr,none": 0.026750070374865202}}, "groups": {"mmlu": {"acc,none": 0.8088854095312653, "acc_stderr,none": 0.003734034978051547, "alias": "mmlu"}, "mmlu_humanities": {"acc,none": 0.7634577603143419, "acc_stderr,none": 0.00819165269421013, "alias": " - humanities"}, "mmlu_other": {"acc,none": 0.8105349122090649, "acc_stderr,none": 0.007621974748052918, "alias": " - other"}, "mmlu_social_sciences": {"acc,none": 0.8837719298245614, "acc_stderr,none": 0.006503914671117776, "alias": " - social sciences"}, "mmlu_stem": {"acc,none": 0.7887945670628184, "acc_stderr,none": 0.007228021241323262, "alias": " - stem"}}, "group_subtasks": {"winogrande": [], "arc_challenge": [], "arc_easy": [], "boolq": [], "hellaswag": [], "mmlu_humanities": ["mmlu_formal_logic", "mmlu_high_school_european_history", "mmlu_high_school_us_history", "mmlu_high_school_world_history", "mmlu_international_law", "mmlu_jurisprudence", "mmlu_logical_fallacies", "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_law", "mmlu_world_religions"], "mmlu_social_sciences": ["mmlu_econometrics", "mmlu_high_school_geography", "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics", "mmlu_high_school_microeconomics", "mmlu_high_school_psychology", "mmlu_human_sexuality", "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies", "mmlu_sociology", "mmlu_us_foreign_policy"], "mmlu_other": ["mmlu_business_ethics", "mmlu_clinical_knowledge", "mmlu_college_medicine", "mmlu_global_facts", "mmlu_human_aging", "mmlu_management", "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous", "mmlu_nutrition", "mmlu_professional_accounting", "mmlu_professional_medicine", "mmlu_virology"], "mmlu_stem": ["mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics", "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science", "mmlu_high_school_mathematics", "mmlu_high_school_physics", "mmlu_high_school_statistics", "mmlu_machine_learning"], "mmlu": ["mmlu_stem", "mmlu_other", "mmlu_social_sciences", "mmlu_humanities"], "openbookqa": [], "rte": []}, "configs": {"arc_challenge": {"task": "arc_challenge", "tag": ["ai2_arc"], "dataset_path": "allenai/ai2_arc", "dataset_name": "ARC-Challenge", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{choices.label.index(answerKey)}}", "unsafe_code": false, "doc_to_choice": "{{choices.text}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": {"sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_choice": "{{choices.text}}", "doc_to_target": "{{choices.label.index(answerKey)}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " "}, "num_fewshot": 0, "metric_list": [{"metric": "acc", "aggregation": "mean", "higher_is_better": true}, {"metric": "acc_norm", "aggregation": "mean", "higher_is_better": true}], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", "metadata": {"version": 1.0, "pretrained": "/home/ser/reap/reap-pr17-layerwise-20260319/artifacts/Qwen3.5-35B-A3B/combined/pruned_models/layerwise_reap-renorm_true-seed_42-0.20", "base_url": "http://0.0.0.0:8000/v1/completions", "tokenized_requests": false, "num_concurrent": 8, "timeout": 1200, "max_retries": 10, "trust_remote_code": true}}, "arc_easy": {"task": "arc_easy", "tag": ["ai2_arc"], "dataset_path": "allenai/ai2_arc", "dataset_name": "ARC-Easy", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{choices.label.index(answerKey)}}", "unsafe_code": false, "doc_to_choice": "{{choices.text}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": {"sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_choice": "{{choices.text}}", "doc_to_target": "{{choices.label.index(answerKey)}}", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " "}, "num_fewshot": 0, "metric_list": [{"metric": "acc", "aggregation": "mean", "higher_is_better": true}, {"metric": "acc_norm", "aggregation": "mean", "higher_is_better": true}], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", "metadata": {"version": 1.0, "pretrained": "/home/ser/reap/reap-pr17-layerwise-20260319/artifacts/Qwen3.5-35B-A3B/combined/pruned_models/layerwise_reap-renorm_true-seed_42-0.20", "base_url": "http://0.0.0.0:8000/v1/completions", "tokenized_requests": false, "num_concurrent": 8, "timeout": 1200, "max_retries": 10, "trust_remote_code": true}}, "boolq": {"task": "boolq", "tag": ["super-glue-lm-eval-v1"], "dataset_path": "aps/super_glue", "dataset_name": "boolq", "training_split": "train", "validation_split": "validation", "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:", "doc_to_target": "label", "unsafe_code": false, "doc_to_choice": ["no", "yes"], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": {"sampler": "default", "split": null, "process_docs": null, "fewshot_indices": null, "samples": null, "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:", "doc_to_choice": ["no", "yes"], "doc_to_target": "label", "gen_prefix": null, "fewshot_delimiter": "\n\n", "target_delimiter": " "}, "num_fewshot": 0, "metric_list": [{"metric": "acc"}], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "passage", "metadata": {"version": 2.0, "pretrained": "/home/ser/reap/reap-pr17-layerwise-20260319/artifacts/Qwen3.5-35B-A3B/combined/pruned_models/layerwise_reap-renorm_true-seed_42-0.20", "base_url": "http://0.0.0.0:8000/v1/completions", "tokenized_requests": false, "num_concurrent": 8, "timeout": 1200, "max_retries": 10, "trust_remote_code": true}}, "hellaswag": {"task": "hellaswag", "tag": ["multiple_choice"], "dataset_path": "Rowan/hellaswag", "training_split": "train", "validation_split": "validation", "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n", "doc_to_text": "{{query}}", "doc_to_target": "{{label}}", "unsafe_code": false, "doc_to_choice": "choices", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": {"sampler": "default", "split": null, "process_docs":