import numpy as np
import pandas as pd
import random
from helper import *
import warnings
from sklearn.model_selection import train_test_split
import h5py
warnings.filterwarnings('ignore')

# Load the raw matrix and its column labels from the HDF5 file.
# The file is opened in a context manager so the handle is released as soon
# as the contents have been copied into memory.
path = './warfarin.h5'
with h5py.File(path, 'r') as f:
    # `[:]` materialises the dataset as an in-memory array, so `data`
    # remains usable after the file has been closed.
    data = f['data'][:]
    # Labels are stored as byte strings; decode them to `str`.
    labels = [raw.decode('utf-8') for raw in f['labels']]

# Column names handed to `cvar_step` for every evaluation split.
# NOTE(review): the `v_*` and `c_*` entries look like one-hot genotype
# indicators - confirm against the dataset documentation.
var_name = [
    'age', 'dose', 'Black or African American', 'Asian',
    'v_A/A', 'v_A/G', 'v_G/G', 'v_nan',
    'amiodarone', 'enzyme',
    'c_*1/*1', 'c_*1/*11', 'c_*1/*13', 'c_*1/*14', 'c_*1/*2',
    'c_*1/*3', 'c_*1/*5', 'c_*1/*6', 'c_*2/*2', 'c_*2/*3',
    'c_*3/*3', 'c_nan',
]

# Thresholds passed to `cvar_step`; there is one plotted bar group per entry.
alpha_list = [0, 0.2, 0.4, 0.6, 0.8, 0.95]

# Model identifiers passed verbatim to `gridsearch`; the spelling must stay
# exactly as is because the strings are also used as dictionary keys.
model_names = [
    'Regression',
    'Ridge',
    'XGBoost',
    'Gradient Boosting Regressor',
    'Random Forrest Regression',
]

# Build the working table: the stored matrix is transposed so that each entry
# of `labels` names one column.
data_pd = pd.DataFrame(np.transpose(np.array(data[:])), columns=labels)

# Features are every column except 'dose'.
_feature_mask = data_pd.columns != 'dose'
warfarin_X = data_pd.loc[:, _feature_mask]

# The regression target is the square root of the dose.
warfarin_Y = data_pd['dose'].to_numpy() ** 0.5

# Fraction of the rows used for model fitting in each run.
train_prop = 0.7

# Results of every run: cvar_list[seed][model name] holds whatever `cvar_step`
# returned for that model on that seed's evaluation split.
cvar_list = {}

num_run = 50
for seed in range(num_run):
    print(seed)  # progress indicator

    # First split: `train_prop` of the rows for fitting, the rest held out.
    X_train, X_remaining, Y_train, Y_remaining = train_test_split(
        warfarin_X, warfarin_Y, train_size=train_prop, random_state=seed)
    # Second split: 90% of the held-out rows are used for the CVaR step.
    X_cvar, X_test, Y_cvar, Y_test = train_test_split(
        X_remaining, Y_remaining, train_size=0.9, random_state=seed + 20)
    # Columns are added to X_cvar below; take an explicit copy so those
    # assignments never target a slice of another frame.
    X_cvar = X_cvar.copy()

    # Original feature columns, captured before any loss column is added.
    train_names = X_train.columns
    cvar_warfarin = dict.fromkeys(model_names)
    model_warfarin = dict.fromkeys(model_names)
    for m_name in model_names:
        cur_model = gridsearch(X_train, Y_train, m_name, seed)
        model_warfarin[m_name] = cur_model

        # Per-row squared error of this model on the CVaR split.
        # (The previous version also computed the same loss on all of
        # X_remaining and stored it in a column that was never read.)
        loss_col = 'loss' + m_name
        predictions = cur_model.predict(X_cvar[train_names])
        X_cvar[loss_col] = np.asarray((predictions - Y_cvar) ** 2)
        # 'dose' is listed in var_name, so it has to be present as a column;
        # note that this is the square-root-transformed target.
        X_cvar['dose'] = Y_cvar
        cvar_warfarin[m_name] = cvar_step(X_cvar, Y_cvar, var_name, alpha_list, loss_col, seed)
    cvar_list[seed] = cvar_warfarin
    
import matplotlib.pyplot as plt

# Aggregate the per-seed results: for each model, collect one column per seed
# and reduce across seeds (axis=1) to a mean and a standard deviation per row.
mean_dict = {}
std_dict = {}

for m_name in model_names:
    per_seed = pd.DataFrame()
    for seed in range(num_run):
        # Column label is the integer seed of the run.
        per_seed[seed] = cvar_list[seed][m_name]
    mean_dict[m_name] = per_seed.mean(axis=1)
    std_dict[m_name] = per_seed.std(axis=1)

# Models shown in the figure (keys into mean_dict / std_dict) and the legend
# text used for each of them, in matching order.
model_selected = ['Regression', 'Ridge', 'XGBoost','Random Forrest Regression']
m_names = ['Linear Regression', 'Ridge Regression', 'XGBoost', 'Random Forest']

width = 0.12  # the width of the bars

# One tick label per entry of alpha_list. The second label previously read
# '80' without the percent sign, unlike all the others.
labels = ['100%', '80%', '60%', '40%', '20%', '5%']
x = np.arange(len(labels))  # the label locations
colors=['#4daf4a', '#ff7f00', '#984ea3', '#FFD43B', '#a65628', '#f781bf', '#e41a1c', '#377eb8']

fig, ax = plt.subplots(figsize=(6,4))

# Draw one bar series per selected model; rect[m] keeps the bar container.
rect = {}
for (i,m) in enumerate(model_selected):
    # Each series is shifted by (-1.1 + i) bar widths from the tick position.
    # NOTE(review): with four series this group is not centred on the tick -
    # confirm the offset is intentional.
    # Error bars are 1.96 * std / sqrt(num_run), i.e. 1.96 standard errors of
    # the mean across runs.
    # NOTE(review): capthick=-1 is an unusual (negative) cap thickness -
    # confirm it is intended.
    rect[m] = ax.bar(x + (-1.1+i)*width, np.array(mean_dict[m]), width, 
                     label=m_names[i], color=colors[i], 
                     yerr = 1.96*std_dict[m]/(num_run**0.5), 
                     error_kw=dict(lw=0.5, capsize=1, capthick=-1))    

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel(r'$\mathsf{W}_{\alpha}$', fontsize=15)
ax.set_xlabel('Subpopulation size ' + r'$\alpha$', fontsize=15)
# ax.set_title('Warfarin, CVaR metric at varying thresholds')
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=15)
ax.legend(loc='upper left', fontsize=14)

# Fixed y-range, then write the figure to disk with tight bounding box.
ax.set_ylim((0,8))
plt.savefig('warfarin-cvar.png', bbox_inches="tight")
