from transformers import PreTrainedTokenizer
import re
import torch
import random

class CustomCharLevelTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for the alphabet ``0123456789+-x=``.

    IDs 0-3 are assigned to ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``;
    the alphabet characters follow in order, starting at ID 4.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        characters = '0123456789+-x='

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters that are not in the vocabulary become the unknown token.
        """
        return [char if char in self.vocab else self.unk_token for char in text]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text

class CustomCharLevelTokenizerWithWhiteSpace(PreTrainedTokenizer):
    """Character-level tokenizer for ``0123456789+-x=`` plus the space.

    IDs 0-3 are assigned to ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``;
    the alphabet characters follow in order, starting at ID 4. Runs of
    whitespace are collapsed to a single space before tokenizing.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        characters = '0123456789+-x= '

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Every run of whitespace is first collapsed to a single space.
        Characters that are not in the vocabulary become the unknown token.
        """
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        collapsed = re.sub(r'\s+', ' ', text)
        return [char if char in self.vocab else self.unk_token for char in collapsed]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text

class CustomCharLevelTokenizerForAddingPadding(PreTrainedTokenizer):
    """Character-level tokenizer that turns every space into ``[PAD]``.

    The alphabet is ``0123456789+-x=`` plus the space. IDs 0-3 are assigned
    to ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``; the alphabet characters
    follow in order, starting at ID 4. The space keeps its own vocabulary
    entry, but ``_tokenize`` always emits ``[PAD]`` in its place.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        characters = '0123456789+-x= '

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token, and each
        space is emitted as ``[PAD]``. Whitespace is not collapsed, so the
        number of tokens equals the number of characters.
        """
        return [
            (char if char in self.vocab else self.unk_token).replace(' ', '[PAD]')
            for char in text
        ]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text

class PrefixSumsTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for the alphabet ``01=`` plus the space.

    IDs 0-3 are assigned to ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``;
    the alphabet characters follow in order, starting at ID 4. Whitespace
    runs are collapsed to one space, which is then emitted as ``[PAD]``.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        characters = '01= '

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Every run of whitespace is first collapsed to a single space.
        Characters outside the vocabulary become the unknown token, and each
        remaining space is emitted as ``[PAD]``.
        """
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        collapsed = re.sub(r'\s+', ' ', text)
        return [
            (char if char in self.vocab else self.unk_token).replace(' ', '[PAD]')
            for char in collapsed
        ]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text
    

class CustomCharLevelTokenizerForAddingPadding_Base100(PreTrainedTokenizer):
    """Tokenizer that encodes numbers as base-100 digits.

    The vocabulary holds the two-character strings ``00`` to ``99`` followed
    by ``+``, ``-``, ``x``, ``=`` and the space. IDs 0-3 are assigned to
    ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``; the other symbols follow in
    order, starting at ID 4. The space keeps its own vocabulary entry, but
    ``_tokenize`` always emits ``[PAD]`` in its place.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Width of one base-100 digit once it is zero padded ("07", "42", ...).
        self.digit_length = 2
        characters = [str(value).zfill(self.digit_length) for value in range(10 ** self.digit_length)]
        characters = characters + ['+', '-', 'x', '=', ' ']

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the vocabulary; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the symbols start at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    def base_10_to_base_b(self, n: int, b: int = 100):
        """Return the base-``b`` digits of ``n``, most significant first.

        Each digit is given as its decimal string without zero padding, e.g.
        ``1234`` in base 100 yields ``['12', '34']``.

        Args:
            n: Non-negative integer to convert.
            b: Target base; must be at least 2.

        Returns:
            List of digit strings; ``['0']`` when ``n`` is 0.

        Raises:
            ValueError: If ``n`` is negative or ``b`` is smaller than 2. The
                conversion is undefined for such inputs, and for a negative
                ``n`` the division loop would never terminate.
        """
        if n < 0:
            raise ValueError("n must be non-negative")
        if b < 2:
            raise ValueError("b must be at least 2")
        if n == 0:
            # A string, like every other digit this method returns.
            return ['0']
        digits = []
        while n:
            n, remainder = divmod(n, b)
            digits.append(str(remainder))
        return digits[::-1]

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into base-100 digit tokens, operators and padding.

        Each run of decimal digits is read as one integer and emitted as its
        zero-padded base-100 digits. Each whitespace character is handled on
        its own: a space becomes ``[PAD]`` and any other whitespace becomes
        the unknown token. ``+``, ``-``, ``x`` and ``=`` are emitted as they
        are. Characters matched by none of these patterns are dropped.
        """
        # '\s' rather than '\s+': a run of several spaces must give one [PAD]
        # per space, not a single multi-character piece that maps to [UNK].
        pieces = re.findall(r'(\d+|\s|[-+x=])', text)
        tokens = []
        for piece in pieces:
            if piece.isdigit():
                # Zero pad so that e.g. "1" matches the vocabulary entry "01".
                candidates = [
                    str(digit).zfill(self.digit_length)
                    for digit in self.base_10_to_base_b(int(piece))
                ]
            else:
                candidates = [piece]
            tokens.extend(
                candidate if candidate in self.vocab else self.unk_token
                for candidate in candidates
            )
        # A single pass at the end turns the spaces into pad tokens.
        return [token.replace(' ', '[PAD]') for token in tokens]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        The tokens are concatenated as they are, so numbers appear as their
        zero-padded base-100 digit strings. ``[PAD]``, ``[BOS]`` and ``[EOS]``
        are removed; IDs without a vocabulary entry are rendered as the
        unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text
    
class CustomCharLevelTokenizerForAddingPadding_Base1000(PreTrainedTokenizer):
    """Tokenizer that encodes numbers as base-1000 digits.

    The vocabulary holds the three-character strings ``000`` to ``999``
    followed by ``+``, ``-``, ``x``, ``=`` and the space. IDs 0-3 are assigned
    to ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``; the other symbols follow
    in order, starting at ID 4. The space keeps its own vocabulary entry, but
    ``_tokenize`` always emits ``[PAD]`` in its place.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Width of one base-1000 digit once it is zero padded ("007", "042").
        self.digit_length = 3
        characters = [str(value).zfill(self.digit_length) for value in range(10 ** self.digit_length)]
        characters = characters + ['+', '-', 'x', '=', ' ']

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the vocabulary; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the symbols start at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    def base_10_to_base_b(self, n: int, b: int = 1000):
        """Return the base-``b`` digits of ``n``, most significant first.

        Each digit is given as its decimal string without zero padding, e.g.
        ``1234567`` in base 1000 yields ``['1', '234', '567']``.

        Args:
            n: Non-negative integer to convert.
            b: Target base; must be at least 2.

        Returns:
            List of digit strings; ``['0']`` when ``n`` is 0.

        Raises:
            ValueError: If ``n`` is negative or ``b`` is smaller than 2. The
                conversion is undefined for such inputs, and for a negative
                ``n`` the division loop would never terminate.
        """
        if n < 0:
            raise ValueError("n must be non-negative")
        if b < 2:
            raise ValueError("b must be at least 2")
        if n == 0:
            # A string, like every other digit this method returns.
            return ['0']
        digits = []
        while n:
            n, remainder = divmod(n, b)
            digits.append(str(remainder))
        return digits[::-1]

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into base-1000 digit tokens, operators and padding.

        Each run of decimal digits is read as one integer and emitted as its
        zero-padded base-1000 digits. Each whitespace character is handled on
        its own: a space becomes ``[PAD]`` and any other whitespace becomes
        the unknown token. ``+``, ``-``, ``x`` and ``=`` are emitted as they
        are. Characters matched by none of these patterns are dropped.
        """
        # '\s' rather than '\s+': a run of several spaces must give one [PAD]
        # per space, not a single multi-character piece that maps to [UNK].
        pieces = re.findall(r'(\d+|\s|[-+x=])', text)
        tokens = []
        for piece in pieces:
            if piece.isdigit():
                # Zero pad so that e.g. "1" matches the vocabulary entry "001".
                candidates = [
                    str(digit).zfill(self.digit_length)
                    for digit in self.base_10_to_base_b(int(piece))
                ]
            else:
                candidates = [piece]
            tokens.extend(
                candidate if candidate in self.vocab else self.unk_token
                for candidate in candidates
            )
        # A single pass at the end turns the spaces into pad tokens.
        return [token.replace(' ', '[PAD]') for token in tokens]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        The tokens are concatenated as they are, so numbers appear as their
        zero-padded base-1000 digit strings. ``[PAD]``, ``[BOS]`` and
        ``[EOS]`` are removed; IDs without a vocabulary entry are rendered as
        the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text
    
class CustomCharLevelTokenizerForAddingPaddingForMod(PreTrainedTokenizer):
    """Character-level tokenizer with parentheses; spaces become ``[PAD]``.

    The alphabet is ``()0123456789+-x=`` plus the space. IDs 0-3 are assigned
    to ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``; the alphabet characters
    follow in order, starting at ID 4. The space keeps its own vocabulary
    entry, but ``_tokenize`` always emits ``[PAD]`` in its place.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        characters = '()0123456789+-x= '

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token, and each
        space is emitted as ``[PAD]``. Whitespace is not collapsed, so the
        number of tokens equals the number of characters.
        """
        return [
            (char if char in self.vocab else self.unk_token).replace(' ', '[PAD]')
            for char in text
        ]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text


class CustomCharLevelTokenizerForDelete(PreTrainedTokenizer):
    """Character-level tokenizer with an extra ``D`` symbol; spaces become ``[PAD]``.

    The alphabet is ``0123456789+-x=``, the space and ``D``. IDs 0-3 are
    assigned to ``[PAD]``, ``[UNK]``, ``[BOS]`` and ``[EOS]``; the alphabet
    characters follow in order, starting at ID 4. The space keeps its own
    vocabulary entry, but ``_tokenize`` always emits ``[PAD]`` in its place.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        characters = '0123456789+-x= D'

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token, and each
        space is emitted as ``[PAD]``. Whitespace is not collapsed, so the
        number of tokens equals the number of characters.
        """
        return [
            (char if char in self.vocab else self.unk_token).replace(' ', '[PAD]')
            for char in text
        ]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text

class CustomCharLevelTokenizerForAddingPaddingWithIndexHints(PreTrainedTokenizer):
    """Character-level tokenizer with an extra symbol set; spaces become ``[PAD]``.

    The alphabet is ``0123456789+-x=`` and the space, followed by the symbols
    stored in ``self.char_set``. IDs 0-3 are assigned to ``[PAD]``,
    ``[UNK]``, ``[BOS]`` and ``[EOS]``; the alphabet characters follow in
    order, starting at ID 4. The space keeps its own vocabulary entry, but
    ``_tokenize`` always emits ``[PAD]`` in its place.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Extra symbols appended after the arithmetic alphabet. Lower-case
        # 'x' is left out here because the arithmetic alphabet already has it.
        # NOTE(review): judging by the class name these serve as index hints -
        # confirm with the data pipeline that produces them.
        self.char_set = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwyz!@£#$%^&*()~?.,<>{}[]:;/|βΓΔδεζηθκΛλμΞξΠπΣςτΦφχΨψΩω"
        characters = '0123456789+-x= ' + self.char_set

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token, and each
        space is emitted as ``[PAD]``. Whitespace is not collapsed, so the
        number of tokens equals the number of characters.
        """
        return [
            (char if char in self.vocab else self.unk_token).replace(' ', '[PAD]')
            for char in text
        ]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text


class CustomCharLevelTokenizerSort(PreTrainedTokenizer):
    """Character-level tokenizer for the sort alphabet; spaces become ``[PAD]``.

    The alphabet is ``0123456789D,:=`` and the space, followed by the symbols
    stored in ``self.char_set``. IDs 0-3 are assigned to ``[PAD]``,
    ``[UNK]``, ``[BOS]`` and ``[EOS]``; the alphabet characters follow in
    order, starting at ID 4. The space keeps its own vocabulary entry, but
    ``_tokenize`` always emits ``[PAD]`` in its place.
    """

    def __init__(self, **kwargs):
        """Build the vocabulary and initialise the base tokenizer.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Extra symbols appended after the base alphabet. 'D', ',' and ':'
        # are left out here because the base alphabet already has them;
        # lower-case 'x' is left out too and is absent from this vocabulary.
        set_of_chars = ['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                        'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '!', '@', '£', '#', '$', '%', '^',
                        '&', '*', '(', ')', '~', '?', '.', '<', '>', '{', '}', '[', ']', ';', '/', '|', 'β', 'Γ', 'Δ',
                        'δ', 'ε', 'ζ', 'η', 'θ', 'κ', 'Λ', 'λ', 'μ', 'Ξ', 'ξ', 'Π', 'π', 'Σ', 'ς', 'τ', 'Φ', 'φ', 'χ',
                        'Ψ', 'ψ', 'Ω', 'ω']
        self.char_set = ''.join(set_of_chars)
        characters = '0123456789D,:= ' + self.char_set

        # The special tokens and lookup tables are installed both before and
        # after the base initialiser, keeping the original set-up order.
        # NOTE(review): presumably the base class needs the vocabulary during
        # its own __init__ and may reset the special tokens - confirm against
        # the installed transformers version before simplifying.
        self._install_vocab(characters)
        super().__init__(**kwargs)
        self._install_vocab(characters)

    def _install_vocab(self, characters):
        """Set the four special tokens and rebuild both lookup tables.

        Args:
            characters: Ordered symbols of the alphabet; the symbol at
                position ``i`` receives ID ``i + 4``.
        """
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.bos_token = '[BOS]'
        self.eos_token = '[EOS]'

        # IDs 0-3 belong to the special tokens, so the alphabet starts at 4.
        self.vocab = {symbol: index for index, symbol in enumerate(characters, start=4)}
        self.vocab.update({self.pad_token: 0, self.unk_token: 1, self.bos_token: 2, self.eos_token: 3})

        # Reverse table used when decoding.
        self.ids_to_tokens = {index: token for token, index in self.vocab.items()}

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token-to-ID dictionary (the live object, not a copy)."""
        return self.vocab

    def _tokenize(self, text):
        """Split ``text`` into one token per character.

        Characters outside the vocabulary become the unknown token, and each
        space is emitted as ``[PAD]``. Whitespace is not collapsed, so the
        number of tokens equals the number of characters.
        """
        return [
            (char if char in self.vocab else self.unk_token).replace(' ', '[PAD]')
            for char in text
        ]

    def _convert_token_to_id(self, token):
        """Return the ID of ``token``, or the unknown-token ID if it is absent."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        """Return the token for ``index``, or the unknown token if it is absent."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def __call__(self, text, **kwargs):
        """Encode ``text`` into token IDs.

        Args:
            text: String to encode.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            ``{"input_ids": [...]}`` holding one ID per token.
        """
        return {"input_ids": [self._convert_token_to_id(token) for token in self._tokenize(text)]}

    def decode(self, token_ids, **kwargs):
        """Turn a sequence of token IDs back into text.

        ``[PAD]``, ``[BOS]`` and ``[EOS]`` are removed from the result; IDs
        without a vocabulary entry are rendered as the unknown token.

        Args:
            token_ids: Iterable of IDs. Elements may be plain ints or integer
                scalars such as the elements yielded by iterating a 1-D
                ``torch.Tensor``.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The decoded string.
        """
        # int() matters for tensor elements: they hash by identity, so looking
        # them up directly would miss every entry and decode as [UNK].
        tokens = [self._convert_id_to_token(int(token_id)) for token_id in token_ids]
        text = ''.join(tokens)
        for special in (self.pad_token, self.bos_token, self.eos_token):
            text = text.replace(special, '')
        return text
