import os
import requests
from bs4 import BeautifulSoup
import json
from tempfile import TemporaryDirectory
from tqdm.auto import tqdm
import argparse
import pandas as pd
from datasets.utils.logging import set_verbosity_error
from datasets import disable_progress_bars
from datasets import (
    load_dataset,
    load_dataset_builder,
    Dataset,
    Features,
    Value,
    Sequence,
)
from datasets.exceptions import (
    DatasetNotFoundError,
    DatasetGenerationError,
    DataFilesNotFoundError,
)
from requests.exceptions import ConnectionError
import numpy as np

# Quiet the `datasets` library at import time: raise its log threshold
# (presumably to ERROR only, going by the helper's name) and turn off its
# progress bars. NOTE(review): presumably done so the per-model tqdm bar and
# the error lines printed by this script stay readable; confirm.
set_verbosity_error()
disable_progress_bars()

# Commands to upload the scripts and necessary files
# dropbox put datasets/crawl_leaderboard.py /Easy-to-Hard/crawl_leaderboard.py
# Commands to run on remote machines
# cd workspace && wget https://github.com/dropbox/dbxcli/releases/download/v3.0.0/dbxcli-linux-amd64 && mv dbxcli-linux-amd64 dropbox && chmod +x dropbox && ./dropbox account
# ./dropbox get /Easy-to-Hard/crawl_leaderboard.py crawl_leaderboard.py && python3 -m pip install requests beautifulsoup4 pandas numpy tqdm datasets hf_transfer && mkdir -p data/Winogrande/leaderboard_performance/splits && seq 0 15 | xargs -n 1 -P 4 python3 crawl_leaderboard.py --dataset_name Winogrande --index
# watch -n1 "find ./data -type f | wc -l"


# Supported benchmarks, one row each:
#   (benchmark name, config name inside the details dataset, expected number
#    of evaluated examples)
# Both lookup tables below are derived from this single table so they can
# never drift apart.
_BENCHMARK_TABLE = (
    ("GSM8K", "harness_gsm8k_5", 1319),
    ("ARC", "harness_arc_challenge_25", 1172),
    ("Winogrande", "harness_winogrande_5", 1267),
    ("HellaSwag", "harness_hellaswag_10", 10042),
)

# Benchmark name -> config name of the result dataset
RESULT_DATASET_CONFIG_NAMES = {
    benchmark: config_name for benchmark, config_name, _ in _BENCHMARK_TABLE
}

# Benchmark name -> number of examples a complete result must contain
RESULT_DATASET_LENGTH = {
    benchmark: num_examples for benchmark, _, num_examples in _BENCHMARK_TABLE
}

# Schema of the model performance dataset: three string metadata columns
# followed by a sequence of booleans (one flag per benchmark example).
# Key order matters: it is the column order used when the dataset is built.
DATASET_FEATURES = {
    column: Value("string")
    for column in ("model_name", "model_sha", "eval_timestamp")
}
DATASET_FEATURES["accuracies"] = Sequence(Value("bool"))


# Get the leaderboard data from the huggingface leaderboard page
def get_leaderboard_dataframe():
    """Scrape the leaderboard page and return its table as a DataFrame.

    The page is fetched over HTTP and the JSON embedded in its second
    ``<script>`` element is parsed. The position of the table inside
    ``json_data["components"]`` is not stable, so component indexes 10..49 are
    scanned and the first one that yields a complete, non-empty table wins.

    A component is accepted only if its ``props["value"]["data"]`` is a
    non-empty list of rows, its ``props["headers"]`` has at least 15 entries,
    and every row has a cell for every header that is read.

    Returns:
        pandas.DataFrame: one row per leaderboard entry with the columns
        ``"T"`` (first cell of the row), ``"Model"`` (last cell of the row)
        and one column per header in ``headers[2:-1]``. An empty DataFrame is
        returned when no scanned component qualifies.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a non-success HTTP status.
    """
    url = "https://huggingfaceh4-open-llm-leaderboard.hf.space/"
    # Without a timeout a stalled connection would hang the crawler forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on an error page instead of trying to parse it as the leaderboard.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    # We can get the json format data from the second script element
    script_elements = soup.find_all("script")
    # NOTE(review): the fixed [31:-10] slice presumably strips the text that
    # wraps the JSON payload inside the script element; it is brittle and must
    # be re-checked whenever the page layout changes.
    json_data = json.loads(str(script_elements[1])[31:-10])
    # component_index sometimes changes when they update the space, so scan a
    # range of indexes instead of hard-coding one.
    for component_index in range(10, 50):
        try:
            props = json_data["components"][component_index]["props"]
            rows = props["value"]["data"]
            columns = props["headers"]
            # Fewer than 15 columns (this number can definitely change) or a
            # value that is not a list of rows means this is the wrong component.
            if not isinstance(rows, list) or len(columns) < 15:
                continue
            middle_columns = columns[2:-1]
            result_list = []
            for results in rows:
                results_json = {"T": results[0], "Model": results[-1]}
                for col_index, col_name in enumerate(middle_columns, start=2):
                    results_json[col_name] = results[col_index]
                result_list.append(results_json)
        except (KeyError, TypeError, IndexError):
            # Missing keys, unexpected value types, an out-of-range component
            # index or a row that is too short: more than one component can
            # yield *some* rows, but only the right one yields all of them.
            continue
        # An empty table is not evidence of the right component; keep scanning.
        if result_list:
            return pd.DataFrame(result_list)
    # Nothing matched: return an empty frame rather than partial rows from a
    # wrong component.
    return pd.DataFrame([])


# Process the evaluation results of the models
def process_eval_results(dataset_name, leaderboard):
    """Collect per-example correctness flags for every model in `leaderboard`.

    For each name in ``leaderboard["Model"]`` the matching
    ``open-llm-leaderboard/details_<model>`` dataset (``/`` replaced by
    ``__``) is loaded at split ``"latest"`` into a throw-away cache directory,
    sorted by its ``example`` column, and its ``acc`` values are converted to
    booleans. Models whose results cannot be loaded, lack the expected columns,
    or do not contain exactly ``RESULT_DATASET_LENGTH[dataset_name]`` examples
    are reported on stdout and skipped.

    Args:
        dataset_name: Benchmark name; a key of ``RESULT_DATASET_CONFIG_NAMES``
            and ``RESULT_DATASET_LENGTH``.
        leaderboard: DataFrame with at least the columns ``"Model"`` and
            ``"Model sha"``.

    Returns:
        datasets.Dataset: one row per successfully processed model, with the
        columns declared in ``DATASET_FEATURES``.
    """
    records = []
    for model_name in tqdm(
        leaderboard["Model"],
        desc="Processing eval results: ",
    ):
        try:
            repo_id = "open-llm-leaderboard/details_" + model_name.replace("/", "__")
            config_name = RESULT_DATASET_CONFIG_NAMES[dataset_name]
            # Download the result dataset to a temporary directory so the
            # cached files are removed as soon as this model is done.
            with TemporaryDirectory(dir="/tmp") as temp_dir:
                result_dataset = load_dataset(
                    repo_id,
                    config_name,
                    split="latest",
                    cache_dir=temp_dir,
                )
                result_info = load_dataset_builder(
                    repo_id,
                    config_name,
                    cache_dir=temp_dir,
                ).info
                # Important: sort by the example column before reading the
                # accuracies so that they are in the same order for every model.
                if "example" not in result_dataset.column_names:
                    raise KeyError("column example not in the dataset")
                result_dataset = result_dataset.sort("example")
                # The accuracies are always 0 or 1, so they are stored as booleans.
                if "metrics" in result_dataset.column_names:
                    accuracies = np.array(
                        [metric["acc"] for metric in result_dataset["metrics"]]
                    ).astype(bool)
                elif "acc" in result_dataset.column_names:
                    accuracies = np.array(result_dataset["acc"]).astype(bool)
                else:
                    raise KeyError("column metrics or acc not in the dataset")
                # Clean up the cache files, otherwise they will accumulate
                result_dataset.cleanup_cache_files()
            # Explicit check rather than `assert`: asserts are stripped under
            # `python -O`, which would let incomplete results through.
            expected_length = RESULT_DATASET_LENGTH[dataset_name]
            if len(accuracies) != expected_length:
                raise ValueError(
                    f"expected {expected_length} accuracies, got {len(accuracies)}"
                )
            records.append(
                {
                    "model_name": model_name,
                    "model_sha": leaderboard.loc[
                        leaderboard["Model"] == model_name, "Model sha"
                    ].iloc[0],
                    # Largest split name other than "latest"; max() raises
                    # ValueError when there is none.
                    "eval_timestamp": max(
                        timestamp
                        for timestamp in result_info.splits.keys()
                        if timestamp != "latest"
                    ),
                    "accuracies": accuracies,
                }
            )
        except (
            DatasetNotFoundError,
            DatasetGenerationError,
            DataFilesNotFoundError,
            ConnectionError,
            ValueError,
            KeyError,
            IndexError,
            OSError,
            AssertionError,
        ) as e:
            # DatasetNotFoundError: the model does not have a result dataset
            # DatasetGenerationError: an error occurred while generating the dataset
            # DataFilesNotFoundError: no (supported) data files found
            # ConnectionError: Max retries exceeded with url
            # ValueError: config name not found, wrong number of accuracies,
            #             or no timestamped split
            # KeyError: column example, metrics/acc not in the dataset
            # IndexError: the model has no row in the leaderboard
            # OSError: too many open files
            # AssertionError: kept so anything that was skipped before is still skipped
            # If error, skip the model
            print(f"Error processing model {model_name}: {e}")
            continue
    # Convert the list of dictionaries to a dictionary of lists
    columns = {key: [record[key] for record in records] for key in DATASET_FEATURES}
    # Build the Dataset with explicit features so the column types are fixed
    # even when no model was processed successfully.
    return Dataset.from_dict(columns, features=Features(DATASET_FEATURES))


if __name__ == "__main__":
    # Crawl one of 16 shards of the leaderboard (models whose SHA starts with
    # the hex digit given by --index) for one benchmark and write the result to
    # ./data/<dataset_name>/leaderboard_performance/splits/<index>.parquet
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_name", type=str, required=True)
    parser.add_argument("--index", type=int, required=True)

    args = parser.parse_args()
    # Validate explicitly: `assert` is stripped under `python -O`.
    if args.dataset_name not in RESULT_DATASET_CONFIG_NAMES:
        parser.error(
            f"--dataset_name must be one of {sorted(RESULT_DATASET_CONFIG_NAMES)}"
        )
    if not 0 <= args.index < 16:
        parser.error("--index must be in the range 0..15")

    output_dir = f"./data/{args.dataset_name}/leaderboard_performance/splits"
    output_path = f"{output_dir}/{args.index}.parquet"
    # Refuse to overwrite an existing split, and do so before any network work.
    if os.path.exists(output_path):
        raise ValueError(
            f"Dataset {args.dataset_name} leaderboard performance split {args.index} already exists"
        )
    # Create the output directory up front so a long crawl cannot fail at the
    # final write because the directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    leaderboard = get_leaderboard_dataframe()
    # Split the leaderboard based on model SHA hash that starts with the same hex digit
    # There are 16 hex digits, so we can split the leaderboard into 16 parts
    sha_prefix = format(args.index, "x")
    # na=False: rows without a SHA are left out instead of breaking the boolean mask.
    leaderboard = leaderboard[
        leaderboard["Model sha"].str.startswith(sha_prefix, na=False)
    ]
    accuracy_dataset = process_eval_results(args.dataset_name, leaderboard)
    accuracy_dataset.to_parquet(output_path)
