/* DEPRECATED: this driver is no longer used or maintained. Use one of the more
   specific programs instead: MAIN_SAMPLE, MAIN_RESAMPLE, MAIN_CONTINUE, or
   MAIN_SAMPLE_RESAMPLE.
*/

/*---------------------------------------------------
 * file:    main.c
 * purpose: Run nonparametric supervised lda on graph category structure.
 * author:  ahollowa@uci.edu
 * date:    1/11/09
 *-------------------------------------------------*/

#include "mylib.h"     // Contains the C std. lib. headers and global #define statements
#include "sampler.h"   // Sampling routines
#include "graph.h"     // Graph data structure
#include "node.h"      // Node data structure
#include "myio.h"      // I/O routines
#include "alloc.h"     // Memory allocation routines
#include "checksum.h"  // Checksums for count matrices
#include "sparse.h"    // Miscellaneous routines for sparse matrix formats
#include "init.h"      // Initialization routines for Gibb's sampler


/*------------------------------------------
 * global variables
 *------------------------------------------ */
/* Corpus and model dimensions, private to this file; main() fills them in. */
static int ntot;  /* Corpus size in word tokens (line count of the doc-id file) */
static int D;     /* Document count (largest doc id seen, plus one) */
static int W;     /* Vocabulary size (largest word id seen, plus one) */
static int L;     /* Depth limit of the graph, taken from the command line */


/*==========================================
 * main
 *========================================== */
int main(int argc, char* argv[]){

  int allow_new_nodes, flag, training; // Boolean variables  
  int train_iter, test_iter, seed, i, j, k, p_estimate, init_capacity, wid, did, lev, node, W2, num_initial_nodes;
  int *d, *w, *docconcept, *path_lengths, *perm_lengths, *levels, *sum_levels, *Nd, *min_depth; 
  int **dl, ***dwl, **cp, **perm, **paths;
  double alpha, beta_prior, eta_prior, a, b;
  double *ll, *pi_d, *beta, *eta;
  char *did_file, *wid_file, *docpath_file, *level_file, *perm_file, *pi_file, *filename;
  char *dir, *outfile, *cpfile, *docconcept_file;
  FILE *fp;
  clock_t begin, end;
  Graph *graph;
  
  /******************************************************
   *					COMMAND LINE
   *******************************************************/
  if( argc == 15 ){
  	  alpha = atof(argv[1]);        // Hyperparameter for stick-breaking beta distribution
	  beta_prior  = atof(argv[2]);  // Hyper-Hyperparameter for stick-breaking beta distribution
	  eta_prior   = atof(argv[3]);  // Hyper-Hyperparameter for topic Dirichlet prior
	  a     = atof(argv[4]);        // Hyperparameter for level Geometric prior
	  b     = atof(argv[5]);        // Hyperparameter for level Geometric prior
	  L     = atoi(argv[6]);        // Maximum number of nodes in a path (i.e. a node can have depth 0, 1, 2 ... L-1)
	  train_iter      = atoi(argv[7]);
	  test_iter       = atoi(argv[8]);
	  seed	          = atoi(argv[9]);
	  did_file        = argv[10];  
	  wid_file        = argv[11];  
	  perm_file       = argv[12];
	  docconcept_file = argv[13];
	  dir             = argv[14];
  }
  else{
	fprintf(stderr, "usage: %s alpha beta_prior eta_prior a b L train_iter test_iter seed did wid perm docconcept output_dir\n", argv[0]);
    exit(-1);
  }
    
  assert(alpha>0);
  assert(beta_prior>0);
  assert(eta_prior>0);
  assert(a>0);
  assert(b>0);
  assert(L>0);
  assert(train_iter>0);
  assert(test_iter>0);
  seedMT(seed);
    
	
 /******************************************************
   *			READ IN COMMAND LINE ARGUMENTS
   *******************************************************/

  //Read in docword matrix
  ntot = countlines(did_file);
  assert(ntot>0);
  d = ivec(ntot);
  w = ivec(ntot);
  read_ivec(ntot, d, did_file);
  read_ivec(ntot, w, wid_file);

  W = -1;
  D = -1;
  for(i=0; i<ntot; i++){
	if(d[i] > D){ D = d[i]; }
	if(w[i] > W){ W = w[i]; }
  }
  W++;
  D++;
  assert(W>0);
  assert(D>0);
  
  //Read in docconcept matrix
  docconcept = ivec(D);
  read_ivec(D, docconcept, docconcept_file);
  
  
  
  /******************************************************
  *				CREATE GRAPH STRUCTURE
  ******************************************************/
  p_estimate = 1;
  init_capacity = 5;
  graph = allocate_graph();

  //Initialize equivalence classes
  graph->num_equiv = ivec(L);
  graph->equivalence = imat(init_capacity, L);
  for(i=0; i < L; i++){  
	graph->num_equiv[i] = 1;
	graph->equivalence[0][i] = NEW_NODE;
	p_estimate *= 2;
  }
  graph->equiv_capacity = init_capacity;
  graph->max_depth = L;

  //Initialize nodes array
  graph->capacity = init_capacity;
  graph->nodes = nodevec(graph->capacity);
  graph->next_avail_id = 0;
  for(i=0; i < graph->capacity; i++){   
	graph->nodes[i].id = NOT_IN_USE;
  }

  add_new_node(graph, 0, 0, L, W, D);
  add_new_node(graph, 1, 1, L, W, D);
  add_new_node(graph, 2, 1, L, W, D);
  add_new_node(graph, 3, 1, L, W, D);
  add_new_node(graph, 4, 2, L, W, D);
 // add_new_node(graph, 5, 2, L, W, D);
  add_new_node(graph, 6, 2, L, W, D);
  add_new_node(graph, 7, 2, L, W, D);
  add_new_node(graph, 8, 2, L, W, D);
  add_new_node(graph, 9, 2, L, W, D);
  update_next_avail_id(graph);


 
 /******************************************************
  *			READ IN STICK-BREAK PERMUTATION
  *******************************************************/

  //Read in stick-breaking permutations
  num_initial_nodes = countlines(perm_file);
  perm_lengths = ivec(num_initial_nodes);
  perm = read_variable_line_i(perm_file, num_initial_nodes, perm_lengths);
  for(i=0; i < num_initial_nodes; i++){
	if( perm_lengths[i] == 1 ){ continue; }
	assert(graph->nodes[i].num_feasible == perm_lengths[i]);
	for(j=0; j < perm_lengths[i]; j++){
	  graph->nodes[i].feasible[j] = perm[i][j];
	  graph->nodes[i].perm[perm[i][j]] = j;
	}
  }


 /******************************************************
  *			INITIALIZE COUNT MATRICES
  *******************************************************/
  levels = ivec(ntot);     // Word token level assignments
  paths  = imat(D,L);      // Document path assignments
  dl     = imat(D,L);      // Total no. words in doc d at level l
  dwl    = i3d( D, W, L);  // No. times in doc. d that word w occurs at level l
  Nd     = ivec(D);        // Total no. words in a document -- don't really need this
  pi_d   = dvec(D);        // Truncated Geometric level distribution parameter
  sum_levels   = ivec(D);  // Sum of word levels in a document
  min_depth    = ivec(D);  // The maximum level of the words in the doc -- all paths must be at least this depth
  path_lengths = ivec(D);  // Length of document paths
  for(i=0; i<ntot; i++) Nd[d[i]]++;



  /*******************************************************
  *			REMOVE CHOSEN NODE FROM DOCCONCEPT ARRAY
  ********************************************************/
  j = 0;
  for(i=0; i < D; i++){
	  if( docconcept[i] == 5 ){
		docconcept[i] = FREE_PATH;
		j++;
	  }
  }
  printf("There are %d documents with FREE_PATH\n",j);

  
  
  /******************************************************
  *		INITIALIZATION AND PRINTING OF INITIAL STATE
  *******************************************************/
  randomassignment_no_edges(L, D, W, ntot, w, d, paths, path_lengths, levels, sum_levels, dwl, dl, min_depth, docconcept, pi_d, graph);
  checksum_etot_k_agg(graph);
  checksum_dwl(dwl, dl, D, W, L);

  outfile = create_filename(dir,"init_path_assigns.txt");
  print_paths(D, path_lengths, paths ,outfile);
  free(outfile);
  
  outfile = create_filename(dir, "original_graph_structure.txt");
  print_graph(graph, outfile);
  free(outfile);

  //Print program parameters
  printf("D  = %d\n", D);
  printf("W  = %d\n", W);
  printf("L  = %d\n", L);
  printf("Graph capacity = %d\n", graph->capacity);
  printf("p_estimate     = %d\n", p_estimate);
  printf("ntot  = %d\n", ntot);
  printf("seed  = %d\n", seed);
  printf("train_iter  = %d\n", train_iter);
  printf("test_iter   = %d\n", test_iter);
  printf("alpha = %f\n", alpha);
  printf("beta_prior  = %f\n", beta_prior);
  printf("eta_prior   = %f\n", eta_prior);
  printf("a     = %f\n", a);
  printf("b     = %f\n", b);

  
 /******************************************************
  *				GIBBS/MH SAMPLING
  *******************************************************/
  training = TRUE;
  ll   = dvec(train_iter + test_iter + 1);
  beta = dvec(train_iter + test_iter + 1);
  eta  = dvec(train_iter + test_iter + 1);
  beta[0] = beta_prior;
  eta[0]  = eta_prior;
  ll[0]   = log_likelihood(graph, paths, path_lengths, levels, d, w, Nd, sum_levels, pi_d, ntot, D, L, W, alpha, beta[0], eta[0], a, b, beta_prior, eta_prior, docconcept, training);
  printf("\tLog Likelihood = %e\n", ll[0]);

  for(i=0; i< (train_iter + test_iter); i++){
	printf("\n\n Iteration %d:\n", i);
	
	/* When we switch from training to testing, we need to add the test documents to the mix. I.e., we need to initialize them all
	   to the root node which entails incrementing the cp matrix and incrementing the edge counts*/
	if( i == train_iter ){
		training = FALSE;
		for(j=0; j < ntot; j++){
			if( docconcept[d[j]] != FREE_PATH){ continue; }
			wid = w[j];
			did = d[j];
			lev = levels[j];
			node = paths[did][ path_lengths[did] - lev];
			dwl[did][wid][lev]++;
			dl[did][lev]++;
			sum_levels[did] += lev;
			increment_cp( &(graph->nodes[node]), wid, 1);
		}
		
		for(j=0; j<D; j++){
			if( docconcept[j] != FREE_PATH){ continue; }
			increment_cnts_sb(graph, paths[j], j, path_lengths[j]);
			min_depth[j] = 0;
		}
	}
	
	//For testing, we set eta and beta to their initial values and once again learn the best hyperparameter
	//setting for testing (i.e. as opposed to carrying on the best value found from training)
	if( i == train_iter ){
		beta[i+1] = sample_beta(beta_prior,beta_prior, alpha, graph, rgen);
		eta[i+1]  = sample_eta(eta_prior, eta_prior, W, graph, rgen);
	}else{
		beta[i+1] = sample_beta( beta_prior, beta[i], alpha, graph, rgen);
		eta[i+1]  = sample_eta(eta_prior, eta[i], W, graph, rgen);
	}	
	
	
	begin = clock();
	sample_nonparam_z(L, W, D, p_estimate, alpha, beta[i+1], eta[i+1], ntot, d, w, paths, path_lengths, dwl, dl, min_depth, graph, allow_new_nodes, docconcept, training);
	end = clock();
	checksum_etot_k_agg(graph);
	printf("\tGibbs Sampling of Z: %f seconds\n", ((float)end/CLOCKS_PER_SEC - (float)begin/CLOCKS_PER_SEC)); 

	begin = clock();
	sample_level_assignments_geometric( L, W, D, ntot, eta[i+1], pi_d, paths, d, w, dl, dwl, path_lengths, levels, sum_levels, Nd, min_depth, docconcept, graph, training);
	checksum_dwl(dwl, dl, D, W, L);
	end = clock();
	printf("\tGibbs Sampling of L: %f seconds\n", ((float)end/CLOCKS_PER_SEC - (float)begin/CLOCKS_PER_SEC)); 

	begin = clock();
	sample_nonparam_pi(D, ntot, a, b, d, pi_d, path_lengths, levels, sum_levels, Nd, dl, docconcept, graph, training, rgen);
	end = clock();
	printf("\tGibbs Sampling of Pi: %f seconds\n", ((float)end/CLOCKS_PER_SEC - (float)begin/CLOCKS_PER_SEC));

	
	ll[i+1] = log_likelihood(graph, paths, path_lengths, levels, d, w, Nd, sum_levels, pi_d, ntot, D, L, W, alpha, beta[i+1], eta[i+1], a, b, beta_prior, eta_prior, docconcept, training);
	printf("\tLog Likelihood = %e\n", ll[i+1]);

	if( i % 20 == 0  && i > 0){
		
//		filename = calloc(BUFF_SIZE+1, sizeof(char));
//		sprintf(filename, "path_iter%d.txt", i);		
//		outfile = create_filename(dir, filename);
//		write_variable_line_i(outfile, paths, path_lengths, D);
//		free(outfile);
//		free(filename);
//		
//		filename = calloc(BUFF_SIZE+1, sizeof(char));
//		sprintf(filename, "pi_iter%d.txt",i);
//		outfile = create_filename(dir, filename);
//		write_dvec(D, pi_d, outfile);
//		free(outfile); 
//		free(filename);
//		
//		filename = calloc(BUFF_SIZE+1, sizeof(char));
//		sprintf(filename, "levels_iter%d.txt", i);		
//		outfile = create_filename(dir, filename);
//		write_count_ivec(ntot, levels, outfile);
//		free(outfile);
//		free(filename);
//		
//		
//		filename = calloc(BUFF_SIZE+1, sizeof(char));
//		sprintf(filename, "cp_iter%d.txt", i);		
//		outfile = create_filename(dir, filename);
//		print_cp(graph, W, L, outfile);
//		free(outfile);	
//		free(filename);
		
		filename = calloc(BUFF_SIZE+1, sizeof(char));
		sprintf(filename, "graph_iter%d.txt",i);
		outfile = create_filename(dir, filename);
		print_graph(graph, outfile);
		free(outfile);
		free(filename);

//		filename = calloc(BUFF_SIZE+1, sizeof(char));
//		sprintf(filename, "ll_iter%d.txt",i);
//		outfile = create_filename(dir, filename);
//		write_dvec(i, ll, outfile);
//		free(outfile);
//		free(filename);

		filename = calloc(BUFF_SIZE+1, sizeof(char));
		sprintf(filename, "perm_iter%d.txt",i);
		outfile = create_filename(dir, filename);
		print_perm(graph, outfile);
		free(outfile);
		free(filename);	
	
		filename = calloc(BUFF_SIZE+1, sizeof(char));
		sprintf(filename, "etot_iter%d.txt",i);
		outfile = create_filename(dir,filename);
		fp = fopen(outfile, "w"); assert(fp);
		for(j=0; j < graph->capacity; j++){
			if( graph->nodes[j].id==NOT_IN_USE){continue;}
			for(k=0; k < graph->nodes[j].k_capacity; k++){
				fprintf(fp, "%d ", graph->nodes[j].etot_k[k]);
			}
			fprintf(fp,"\n");
		}
		fclose(fp);
		free(outfile);
		free(filename);
	}

	
	printf("\tGraph next avail. id: %d\n", graph->next_avail_id);
  }
  
   
  /******************************************************
  *				PRINT STATE OF CHAIN
  *******************************************************/

  outfile = create_filename(dir, "beta.txt");
  write_dvec(train_iter + test_iter + 1, beta, outfile);
  free(outfile);
  
  outfile = create_filename(dir, "eta.txt");
  write_dvec(train_iter + test_iter + 1, eta, outfile);
  free(outfile);
  
  outfile = create_filename(dir,"ll.txt");
  write_dvec(train_iter + test_iter + 1, ll, outfile);
  free(outfile);
  
  outfile = create_filename(dir, "pi.txt");
  write_dvec(D, pi_d, outfile);
  free(outfile); 
  
  outfile = create_filename(dir, "path.txt");
  write_variable_line_i(outfile, paths, path_lengths, D);
  free(outfile);
    
  outfile = create_filename(dir,"levels.txt");
  write_count_ivec(ntot, levels, outfile);
  free(outfile);
  
  outfile = create_filename(dir, "cp.txt");
  print_cp(graph, W, L, outfile);
  free(outfile);
  
  outfile = create_filename(dir, "perm.txt");
  print_perm(graph, outfile);
  free(outfile);
  
  outfile = create_filename(dir, "final_graph_structure.txt");
  print_graph(graph, outfile);
  free(outfile);

  return 0;
}
