import os
import re
import json
import hashlib

def load_json(path, create=False, content=None):
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file.
        create: When true and ``path`` does not exist, a new file is written
            there instead of raising.
        content: Object written to (and returned for) a newly created file.
            An empty dict is used when this is ``None``.

    Returns:
        The decoded JSON value when the file exists, otherwise the initial
        content that was just written to the new file.

    Raises:
        FileNotFoundError: If ``path`` does not exist and ``create`` is false.
    """
    if os.path.exists(path):
        with open(path, 'r') as file:
            return json.load(file)

    if not create:
        # Raising a bare string is a TypeError in Python 3; use a real
        # exception so callers can catch the missing-file case.
        raise FileNotFoundError("path does not exist: {}".format(path))

    initial = {} if content is None else content
    with open(path, 'w') as file:
        json.dump(initial, file, indent=4)
    return initial

def str_to_identifier(x: str) -> str:
    """Return a short, stable hexadecimal identifier for ``x``.

    The identifier is the MD5 hex digest of the UTF-8 encoding of ``x``. It
    has a negligible collision probability, so it can stand in for the
    longer string in places such as file names.

    Unlike Python's built-in ``hash`` function, the result is deterministic
    between runs and between platforms.

    References:
        https://stackoverflow.com/questions/45015180
        https://stackoverflow.com/questions/5297448
    """
    hasher = hashlib.md5()
    hasher.update(x.encode('utf-8'))
    return hasher.hexdigest()
        
def match_and_remove_first_occurrence(main_string: str, sub_string: str):
    """Remove the first occurrence of ``sub_string`` from ``main_string``.

    Args:
        main_string: Text to search.
        sub_string: Text to locate and cut out.

    Returns:
        A ``(index, text)`` tuple. ``index`` is the position where
        ``sub_string`` was found, or ``-1`` when it is absent. ``text`` is
        ``main_string`` with that occurrence removed, or ``main_string``
        unchanged when nothing was found.
    """
    position = main_string.find(sub_string)
    if position == -1:
        # Nothing to cut: hand the input back untouched.
        return position, main_string

    cut_end = position + len(sub_string)
    return position, main_string[:position] + main_string[cut_end:]

def check_annotated_format(annotated_corpora):
    """Check that every ``[text]`` span is directly followed by a ``{label}``.

    The text is well formed when each bracketed span is immediately followed
    by exactly one braced label, and no braced label appears without a
    bracketed span directly in front of it. Text without any annotation is
    considered well formed.

    Args:
        annotated_corpora: String that may contain ``[text]{label}``
            annotations.

    Returns:
        A ``(is_valid, origin, label)`` tuple, where ``origin`` is the list
        of bracket contents and ``label`` the list of brace contents, both in
        order of appearance.
    """
    origin = re.findall(r'\[([^[\]]*)\]', annotated_corpora)
    label = re.findall(r'\{([^}]*)\}', annotated_corpora)

    # Match adjacent pairs structurally instead of deleting the contents by
    # text search: a search-and-delete hits the first occurrence anywhere, so
    # a span or label that also appears earlier as plain text would wrongly
    # invalidate a well-formed string.
    pairs = re.findall(r'\[([^[\]]*)\]\{([^}]*)\}', annotated_corpora)

    # Every extracted span and label must belong to exactly one adjacent
    # pair, in the same order and with the same content.
    is_valid = (
        [span for span, _ in pairs] == origin
        and [tag for _, tag in pairs] == label
    )
    return is_valid, origin, label