import math

from measure_coverage_patch import main as measure_coverage_patch, save_div
from measure_coverage_patch_multi import main as measure_coverage_patch_multi

# Emit the CSV header row; column order must match the per-approach rows
# printed by the loop that follows.
print(
    "approach",
    "pct_applied_cases",
    "pct_solved_cases",
    "avg_recall",
    "avg_precision",
    "pct_ftp",
    "pct_etp",
    "pct_fails_initially",
    "pct_error_initially",
    "pct_patch_executable",
    "no_all_cases",
    "no_added_tests",
    sep=",",
)
# Dataset and split shared by every evaluation below.
DATASET = "./datasets/swt_bench_lite_aug1_bm25_diff_27k_cl100k"
SPLIT = "test"

# Per-case result keys that are summed over applied cases and then normalised
# by the total number of cases, in output-column order.
PER_ALL_CASES_KEYS = (
    "good_case",
    "recall",
    "precision",
    "ftp",
    "etp",
    "fails_initially",
    "error_initially",
    "patch_executable",
)

# Each entry is (evaluation directory, row label[, libro inference results file]).
# The third element is required for entries whose directory contains "=07",
# because those are evaluated through the multi-seed code path.
for approach in [
    ("swt_lite_golden_test/mode_vanillafuzzy", "golden"),
    ("gpt-4-1106-preview__swt_bench_lite_aug1_bm25_diff_27k_cl100k__seed=0,temperature=0__test/mode_vanillafuzzy", "gpt4 one-shot diff"),
    ("gpt-4-1106-preview__swt_bench_lite_aug1_bm25_diff_27k_cl100k__seed=0,temperature=0__test/mode_vanillafuzzy", "gpt4 one-shot diff (fuzzy)"),
    ("gpt-4-1106-preview__swt_bench_lite_aug1_bm25_27k_cl100k__seed=0,temperature=0__test/mode_custom", "gpt4 one-shot prompt"),
    ("gpt-4-1106-preview__swt_bench_lite_aug1_bm25_27k_cl100k__seed=1,temperature=07__test/mode_custom", "gpt4 libro", "gpt-4-1106-preview__libro_gpt-4-1106-preview__swt_bench_lite_aug1__test__test.jsonl"),
    ("gpt-4-1106-preview__swt_bench_lite_aug1_bm25_27k_cl100k__seed=1,temperature=07__test/mode_custom", "gpt4 libro (ideal)", ""),
    ("acr_swt_bench_lite/mode_vanilla", "gpt4 acr"),
    # NOTE(review): the next two rows share the label "gpt4 swe-agent", so they
    # cannot be told apart in the CSV output — confirm whether that is intended.
    ("swe-agent-demo3__swt_bench_lite__test/mode_vanilla", "gpt4 swe-agent"),
    ("swe-agent-demo4__swt_bench_lite__test/mode_vanilla", "gpt4 swe-agent"),
    # ("claude-3-haiku-20240307__swt_bench_lite_aug1_bm25_diff_27k_cl100k__seed=0,temperature=0__test/mode_vanillafuzzy", "haiku one-shot diff"),
    # ("claude-3-haiku-20240307__swt_bench_lite_aug1_bm25_diff_27k_cl100k__seed=0,temperature=0__test/mode_vanillafuzzy", "haiku one-shot diff (fuzzy)"),
    # ("claude-3-haiku-20240307__swt_bench_lite_aug1_bm25_27k_cl100k__seed=0,temperature=0__test/mode_custom", "haiku one-shot prompt"),
    # ("claude-3-haiku-20240307__swt_bench_lite_aug1_bm25_27k_cl100k__seed=0,temperature=07__test/mode_custom", "haiku libro", "claude-3-haiku-20240307__libro-claude__temperature=0__test.jsonl"),
    # ("claude-3-haiku-20240307__swt_bench_lite_aug1_bm25_27k_cl100k__seed=0,temperature=07__test/mode_custom", "haiku libro (ideal)",  ""),
    # ("swe-agent-demo3-haiku__swt_bench_lite__test/mode_vanilla", "haiku swe-agent"),
    # ("Mixtral-8x22B-Instruct-v01__swt_bench_lite_aug1_bm25_diff_27k_cl100k__temperature=0__test/mode_vanillafuzzy", "Mixtral 8x22B one-shot diff"),
    # ("Mixtral-8x22B-Instruct-v01__swt_bench_lite_aug1_bm25_diff_27k_cl100k__temperature=0__test/mode_vanillafuzzy", "Mixtral 8x22B one-shot diff (fuzzy)"),
    # ("Mixtral-8x22B-Instruct-v01__swt_bench_lite_aug1_bm25_27k_cl100k__temperature=0__test/mode_custom", "Mixtral 8x22B one-shot prompt"),
    # ("Mixtral-8x22B-Instruct-v01__swt_bench_lite_aug1_bm25_27k_cl100k__temperature=07001__test/mode_custom", "mixtral libro (ideal)", ""),
    # ("swe-agent-demo3-mixtral__swt_bench_lite__test/mode_vanilla", "mixtral swe-agent"),
    # ("gpt-4-1106-preview__swt_bench_aug1_oracle__seed=0,temperature=0__test/mode_custom", "gpt4 one-shot prompt oracle"),
    # ("gpt-4-1106-preview__swt_bench_aug1_oracle_patch__seed=0,temperature=0__test/mode_custom", "gpt4 one-shot prompt oracle + patch"),
    # ("ollama_llama3-gradient:70b__swt_bench_lite_aug1_bm25_diff_27k_cl100k__seed=0,temperature=0,max_tokens=2000__test/mode_vanillafuzzy", "ollama 70B one-shot diff"),
    # ("claude-3-opus-20240229__swt_bench_lite_aug1_bm25_diff_27k_cl100k__seed=0,temperature=0__test/mode_vanillafuzzy", "opus one-shot diff"),
]:
    eval_dir, name = approach[:2]

    if "=07" in eval_dir:
        # Directories tagged "=07" are evaluated over seeds 1-5; the row label
        # selects the IDEAL vs. LIBRO setting.
        libro_dir = approach[2]
        ress = measure_coverage_patch_multi(
            f"evaluation_output/{eval_dir}",
            dataset=DATASET,
            split=SPLIT,
            seeds="1,2,3,4,5",
            setting="IDEAL" if "ideal" in name else "LIBRO",
            libro_inference_results=f"inference_output/{libro_dir}",
        )
    else:
        # The fuzzy flag is driven purely by the row label.
        ress = measure_coverage_patch(
            f"evaluation_output/{eval_dir}",
            dataset=DATASET,
            split=SPLIT,
            fuzzy="fuzzy" in name,
        )

    no_all_cases = len(ress)
    # A result counts as "applied" when it carries no "message" entry.
    applied_cases = [res for res in ress if res.get("message", None) is None]
    no_applied_cases = len(applied_cases)

    totals = {
        key: sum(res[key] for res in applied_cases)
        for key in PER_ALL_CASES_KEYS + ("no_added_tests",)
    }

    # Every ratio goes through save_div so that an approach with no results
    # yields the "NA" fallback instead of raising ZeroDivisionError
    # (assumes save_div returns its third argument on a zero denominator, as
    # its use for the other columns suggests — confirm in measure_coverage_patch).
    row = [name, save_div(no_applied_cases, no_all_cases, "NA")]
    row.extend(save_div(totals[key], no_all_cases, "NA") for key in PER_ALL_CASES_KEYS)
    row.append(no_all_cases)
    # Added tests are averaged over applied cases only, not over all cases.
    row.append(save_div(totals["no_added_tests"], no_applied_cases, "NA"))

    print(",".join(str(value) for value in row))
