import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_openml



def w_fn(x):
    """Return the clipped exponential weight ``min(1, exp(-x))`` element-wise.

    Args:
        x: Scalar or array of margins (the booster passes ``H(x) * y``).

    Returns:
        ``1`` wherever ``x <= 0`` and ``exp(-x)`` elsewhere, so every value
        lies in ``[0, 1]``.
    """
    # exp(-max(x, 0)) equals min(1, exp(-x)) but never exponentiates a
    # positive number, so very negative margins cannot overflow.
    return np.exp(-np.maximum(x, 0))

class AgnosticBooster:
    """Boosted ensemble of depth-1 decision trees, one fresh data slice per round.

    Each of the ``T`` rounds consumes a disjoint slice of ``s`` training rows,
    fits a decision stump on a fractionally relabelled copy of that slice and
    appends either the stump or a "subtract the current sign" step to the
    ensemble.  The round whose ensemble scores best on the validation data is
    remembered and used by :meth:`predict`.

    Labels are negated and compared against ``np.sign`` output, so they are
    assumed to be ``+1`` / ``-1``.
    """

    def __init__(self, T, s):
        """Store the number of rounds ``T`` and the per-round sample size ``s``."""
        self.T = T
        self.s = s
        # Per round: a ``predict`` callable, or None for a "-sign(H)" step.
        self.weak_learners = []
        # Per-round step sizes, aligned with ``weak_learners``.
        self.gammas = []
        # 0-based index of the round with the best validation accuracy.
        self.best_iterate = None

    def fit(self, X, y, X_val, y_val):
        """Train the ensemble and record the best round on the validation data.

        Args:
            X: 2-D training features; round ``t`` uses rows
                ``[t * s, (t + 1) * s)``.
            y: Training labels aligned with ``X``.
            X_val: Validation features used to pick the best round.
            y_val: Validation labels aligned with ``X_val``.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``s * T`` exceeds the number of training rows.
        """
        m, _ = X.shape

        if self.s * self.T > m:
            raise ValueError("Not enough samples for the specified number of rounds and sample size per round.")

        # Start clean so a repeated fit() does not stack onto an older ensemble.
        self.weak_learners = []
        self.gammas = []
        self.best_iterate = None

        best_accuracy = -np.inf

        for t in range(self.T):
            start_idx = t * self.s
            end_idx = start_idx + self.s
            X_sample = X[start_idx:end_idx]
            y_sample = y[start_idx:end_idx]

            # Evaluate the current ensemble once on all rows; the round's
            # sample is a slice of X, so its scores are a slice of H.
            H = self.H(X)
            H_sample = H[start_idx:end_idx]
            w_t = w_fn(H_sample * y_sample)

            # Fractional relabeling: every row appears once with its own label
            # (weight (1 + w) / 2) and once with the flipped label
            # (weight (1 - w) / 2).
            X_extended = np.concatenate((X_sample, X_sample), axis=0)
            y_extended = np.concatenate((y_sample, -y_sample), axis=0)
            weights = np.concatenate(((1 + w_t) / 2, (1 - w_t) / 2), axis=0)

            g_t = DecisionTreeClassifier(max_depth=1)
            g_t.fit(X_extended, y_extended, sample_weight=weights)

            # Weighted correlation with the labels of the new stump (alpha)
            # and of the "-sign(H)" step (beta), over all training rows.
            w_tot = w_fn(H * y)
            alpha_t = np.mean(g_t.predict(X) * w_tot * y)
            beta_t = np.mean(-np.sign(H) * w_tot * y)

            # The first two rounds always take the stump.
            if alpha_t >= beta_t or t < 2:
                h_t = g_t.predict
                gamma_t = alpha_t
            else:
                h_t = None
                gamma_t = beta_t

            self.weak_learners.append(h_t)
            self.gammas.append(gamma_t)

            # Score the ensemble *including* the step just added (t + 1 steps);
            # using only the first t steps would never evaluate the newest one.
            val_accuracy = accuracy_score(y_val, np.sign(self.partial_H(X_val, t + 1)))
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                self.best_iterate = t

        return self

    def partial_H(self, X, up_to_t):
        """Return the ensemble score on ``X`` using only the first ``up_to_t`` steps.

        A step stored as ``None`` subtracts ``gamma * sign(result)`` from the
        running score; any other step adds ``gamma * h(X)``.
        """
        result = np.zeros(X.shape[0])
        for gamma, h in zip(self.gammas[:up_to_t], self.weak_learners[:up_to_t]):
            if h is None:
                result -= gamma * np.sign(result)
            else:
                result += gamma * h(X)
        return result

    def H(self, X):
        """Return the ensemble score on ``X`` using every step stored so far."""
        return self.partial_H(X, len(self.weak_learners))

    def predict(self, X):
        """Return ``sign`` of the ensemble score at the best validated round.

        Raises:
            ValueError: If no round has been recorded (model not fitted).
        """
        if self.best_iterate is None:
            raise ValueError("The model has not been fitted yet.")
        # best_iterate is a 0-based round index, so that round's ensemble
        # consists of the first best_iterate + 1 steps.
        return np.sign(self.partial_H(X, self.best_iterate + 1))

def phi_prime(z):
    """Return ``-1`` where ``z <= 0`` and ``-(z + 1) * exp(-z)`` elsewhere.

    Args:
        z: Scalar or array of margins.

    Returns:
        Array of the same shape as ``z`` holding the piecewise value above.
    """
    # np.where evaluates both branches, so exponentiating the raw z would
    # overflow for very negative inputs even though that branch is discarded.
    # Clamping to zero leaves the selected (z > 0) values untouched.
    z_pos = np.maximum(z, 0)
    return np.where(z <= 0, -1, -(z_pos + 1) * np.exp(-z_pos))

class SampleEfficientAgnosticBooster:
    """Boosted ensemble of decision stumps that reuses earlier rounds' samples.

    Every round relabels a fresh slice of ``s`` training rows, adds it to a
    growing weighted pool (the pool keeps total weight ``1 - sigma``, the new
    slice receives ``sigma``) and fits a depth-1 decision tree on the whole
    pool.  The round whose ensemble scores best on the validation data is
    remembered and used by :meth:`predict`.

    Labels are negated and compared against ``np.sign`` output, so they are
    assumed to be ``+1`` / ``-1``.
    """

    def __init__(self, T, s, sigma, eta):
        """Store the hyperparameters.

        Args:
            T: Number of boosting rounds.
            s: Number of training rows consumed per round.
            sigma: Share of the pooled weight given to each new round's rows.
            eta: Stored on the instance; not read by any method of this class.
        """
        self.T = T
        self.s = s
        self.sigma = sigma
        self.eta = eta
        # Per round: a ``predict`` callable, or None for a "-sign(H)" step.
        self.weak_learners = []
        # Per-round step sizes, aligned with ``weak_learners``.
        self.gammas = []
        # 0-based index of the round with the best validation accuracy.
        self.best_iterate = None

    def fit(self, X, y, X_val, y_val):
        """Train the ensemble and record the best round on the validation data.

        Args:
            X: 2-D training features; round ``t`` uses rows
                ``[t * s, (t + 1) * s)``.
            y: Training labels aligned with ``X``.
            X_val: Validation features used to pick the best round.
            y_val: Validation labels aligned with ``X_val``.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``s * T`` exceeds the number of training rows.
        """
        m, _ = X.shape

        # Same guard as AgnosticBooster: every round needs a full slice.
        if self.s * self.T > m:
            raise ValueError("Not enough samples for the specified number of rounds and sample size per round.")

        # Start clean: the loop assumes exactly t steps are stored at round t.
        self.weak_learners = []
        self.gammas = []
        self.best_iterate = None

        # Weighted pool of relabelled rows, created in round 0.
        D_X = D_y = D_w = None

        best_accuracy = -np.inf

        for t in range(self.T):
            start_idx = t * self.s
            end_idx = start_idx + self.s
            X_sample = X[start_idx:end_idx]
            y_sample = y[start_idx:end_idx]

            # Scores of the current ensemble (t steps) and of the previous one
            # (t - 1 steps); partial_H returns zeros when asked for zero steps.
            H_sample = self.partial_H(X_sample, t)
            H_sample_old = self.partial_H(X_sample, max(t - 1, 0))

            grad = phi_prime(H_sample * y_sample)
            w_t = (1 - self.sigma) * phi_prime(H_sample_old * y_sample) - grad

            # Fractional relabeling: each row once with its own label
            # (weight (1 + w) / 2) and once flipped (weight (1 - w) / 2).
            new_X = np.tile(X_sample, (2, 1))
            new_y = np.concatenate((y_sample, -y_sample))
            new_weights = np.concatenate(((1 + w_t) / 2, (1 - w_t) / 2), axis=0)

            if t == 0:
                D_X, D_y = new_X, new_y
                D_w = new_weights / np.sum(new_weights)
            else:
                # Rescale the pool to total weight (1 - sigma) and the new
                # rows to total weight sigma before merging them.
                D_w = D_w * ((1 - self.sigma) / np.sum(D_w))
                new_weights = new_weights * (self.sigma / np.sum(new_weights))

                D_X = np.concatenate((D_X, new_X), axis=0)
                D_y = np.concatenate((D_y, new_y), axis=0)
                D_w = np.concatenate((D_w, new_weights), axis=0)

            clf = DecisionTreeClassifier(max_depth=1)
            clf.fit(D_X, D_y, sample_weight=D_w)

            # Candidate step sizes, measured on this round's fresh rows.
            alpha_t = -np.mean(clf.predict(X_sample) * grad * y_sample)
            beta_t = -np.mean(-np.sign(H_sample) * grad * y_sample)

            # The first two rounds always take the stump.
            if alpha_t >= beta_t or t < 2:
                h_t = clf.predict
                gamma_t = alpha_t
            else:
                h_t = None
                gamma_t = beta_t
            self.weak_learners.append(h_t)
            self.gammas.append(gamma_t)

            # Score the ensemble *including* the step just added (t + 1 steps);
            # using only the first t steps would never evaluate the newest one.
            val_accuracy = accuracy_score(y_val, np.sign(self.partial_H(X_val, t + 1)))
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                self.best_iterate = t

        return self

    def partial_H(self, X, up_to_t):
        """Return the ensemble score on ``X`` using only the first ``up_to_t`` steps.

        A step stored as ``None`` subtracts ``gamma * sign(result)`` from the
        running score; any other step adds ``gamma * h(X)``.
        """
        result = np.zeros(X.shape[0])
        for gamma, h in zip(self.gammas[:up_to_t], self.weak_learners[:up_to_t]):
            if h is None:
                result -= gamma * np.sign(result)
            else:
                result += gamma * h(X)
        return result

    def predict(self, X):
        """Return ``sign`` of the ensemble score at the best validated round.

        Raises:
            ValueError: If no round has been recorded (model not fitted).
        """
        if self.best_iterate is None:
            raise ValueError("The model has not been fitted yet.")
        # best_iterate is a 0-based round index, so that round's ensemble
        # consists of the first best_iterate + 1 steps.
        return np.sign(self.partial_H(X, self.best_iterate + 1))



def repeat_data(X, y, n):
    """Stack ``n`` copies of the feature rows and of the labels.

    Args:
        X: Feature array; tiled ``n`` times along the first axis.
        y: Label array; tiled ``n`` times end to end.
        n: Number of copies.

    Returns:
        Tuple ``(X_repeated, y_repeated)``.
    """
    return np.tile(X, (n, 1)), np.tile(y, n)

def load_dataset(name):
    """Fetch an OpenML dataset and return dense features with float labels.

    For ``'ionosphere'``, ``'diabetes'`` and ``'spambase'`` the target is
    mapped to ``+1`` (positive class) / ``-1``; for any other name the target
    is passed through and only cast to float.  ``int64``/``float64`` columns
    are standardised and ``object``/``category`` columns are one-hot encoded.

    Args:
        name: OpenML dataset name (version 1 is fetched).

    Returns:
        Tuple ``(X, y)`` with ``X`` the transformed feature matrix and ``y``
        the labels cast to float.
    """
    data = fetch_openml(name, version=1, as_frame=True, parser="auto")
    X, y = data.data, data.target

    if name == 'ionosphere':
        y = np.where((y == 'g'), 1, -1)
    elif name == 'diabetes':
        y = np.where((y == 'tested_positive'), 1, -1)
    elif name == 'spambase':
        y = np.where((y == '1'), 1, -1)

    X = pd.DataFrame(X)

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # sparse_threshold=0 forces a dense ndarray.  With the default threshold a
    # sparse one-hot block can make the result a scipy sparse matrix, which
    # the boosters' np.concatenate / np.tile calls cannot handle.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        sparse_threshold=0)

    X = preprocessor.fit_transform(X)
    return X, y.astype(float)  # Ensure y is of type float

def add_noise(y, noise_level, rng=None):
    """Return a float copy of ``y`` with a random fraction of labels negated.

    Args:
        y: Array-like of labels; it is not modified.
        noise_level: Fraction of labels to flip, between 0 and 1.  The number
            flipped is ``int(len(y) * noise_level)``.
        rng: Optional seed or ``numpy.random.Generator`` for reproducible
            flips.  When omitted, NumPy's global random state is used.

    Returns:
        New float array with the selected entries negated.

    Raises:
        ValueError: If ``noise_level`` is outside ``[0, 1]``.
    """
    if not 0 <= noise_level <= 1:
        raise ValueError("noise_level must be between 0 and 1.")

    y = np.asarray(y, dtype=float)  # Ensure y is a numpy array of type float
    n_samples = len(y)
    n_noisy = int(n_samples * noise_level)

    # Both the np.random module and a Generator expose the same choice() call.
    source = np.random if rng is None else np.random.default_rng(rng)
    indices = source.choice(n_samples, n_noisy, replace=False)

    y_noisy = y.copy()
    y_noisy[indices] = -y_noisy[indices]
    return y_noisy

def main():
    """Run the label-noise experiment on three OpenML datasets and print a summary.

    For every dataset, 30-fold split and noise level, both boosters are
    trained over a small hyperparameter grid and the best accuracy found in
    that fold is recorded.  The summary prints the mean of those per-fold best
    accuracies together with the hyperparameters that won most often.

    NOTE: each fold's held-out rows are passed to ``fit`` as the validation
    data and are also used to score the model and to pick the
    hyperparameters, so the reported accuracies are optimistic.
    """
    from collections import Counter

    datasets = {
        'Ionosphere': 'ionosphere',
        'Pima': 'diabetes',
        'Spambase': 'spambase',
    }
    noise_levels = [0, 0.05, 0.1, 0.2]
    T_grid = [25, 50, 100]
    sigma_grid = [0.1, 0.25, 0.5]

    # Per-fold best accuracies.
    results = {dataset: {noise: {'AgnosticBooster': [], 'SEAgnosticBooster': []} for noise in noise_levels} for dataset in datasets}
    # Per-fold winning hyperparameters.  Every fold is kept so the summary can
    # report the most frequent winner instead of whatever the last fold chose.
    best_params = {dataset: {noise: {'AgnosticBooster': [], 'SEAgnosticBooster': []} for noise in noise_levels} for dataset in datasets}

    for name, openml_name in datasets.items():
        print(f"Running {name}")

        X, y = load_dataset(openml_name)

        kf = KFold(n_splits=30, shuffle=True, random_state=42)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for noise_level in noise_levels:
                y_train_noisy = add_noise(y_train, noise_level)
                # Same training data for every grid point, so build it once.
                X_t, y_t = repeat_data(X_train, y_train_noisy, 1)

                # Hyperparameter grid for AgnosticBooster
                best_agn_acc = -np.inf
                best_agn_T = None

                for T in T_grid:
                    agn = AgnosticBooster(T=T, s=X_t.shape[0] // T)
                    agn.fit(X_t, y_t, X_test, y_test)
                    agn_acc = accuracy_score(y_test, agn.predict(X_test))

                    if agn_acc > best_agn_acc:
                        best_agn_acc = agn_acc
                        best_agn_T = T

                results[name][noise_level]['AgnosticBooster'].append(best_agn_acc)
                best_params[name][noise_level]['AgnosticBooster'].append(best_agn_T)

                # Hyperparameter grid for SEAgnosticBooster
                best_sea_acc = -np.inf
                best_sea_T = None
                best_sea_sigma = None

                for T in T_grid:
                    for sigma in sigma_grid:
                        seab = SampleEfficientAgnosticBooster(T=T, s=X_t.shape[0] // T, sigma=sigma, eta=0.1)
                        seab.fit(X_t, y_t, X_test, y_test)
                        seab_acc = accuracy_score(y_test, seab.predict(X_test))

                        if seab_acc > best_sea_acc:
                            best_sea_acc = seab_acc
                            best_sea_T = T
                            best_sea_sigma = sigma

                results[name][noise_level]['SEAgnosticBooster'].append(best_sea_acc)
                best_params[name][noise_level]['SEAgnosticBooster'].append((best_sea_T, best_sea_sigma))

    for dataset in results:
        print(f"Dataset: {dataset}")
        for noise_level in results[dataset]:
            agn_mean_acc = np.mean(results[dataset][noise_level]['AgnosticBooster'])
            sagn_mean_acc = np.mean(results[dataset][noise_level]['SEAgnosticBooster'])
            # Most frequent per-fold winner (ties go to the one seen first).
            agn_T = Counter(best_params[dataset][noise_level]['AgnosticBooster']).most_common(1)[0][0]
            sea_T, sea_sigma = Counter(best_params[dataset][noise_level]['SEAgnosticBooster']).most_common(1)[0][0]
            print(f'Noise Level: {noise_level:.0%}, Agnostic Booster Accuracy: {agn_mean_acc:.2f} (Best T: {agn_T}), SEAgnostic Booster Accuracy: {sagn_mean_acc:.2f} (Best T: {sea_T}, Best Sigma: {sea_sigma})')


# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


