%%  Test Various Spectral Methods for Clustering
%    on Various Datasets. 
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%            1)  Load or Generate a Dataset                   %%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% 1a) Generate 2D Circles Dataset
% Start from a clean workspace, with no open figures and an empty console.
clear all;
close all;
clc;

% Dataset parameters
num_samples = 1000;
dim_samples = 2;
num_classes = 2;

% Generate the data, then transpose both outputs; the plotting call below
% receives X' (i.e. the orientation returned by the generator).
[X,labels] = ml_circles_data(num_samples, dim_samples, num_classes, true);
X      = X';
labels = labels';
[N, M] = size(X);

% Plot original data
plot_options        = struct('is_eig', false, 'title', 'Circles Data');
plot_options.labels = labels;

% Remove a previous figure handle if one is still around
if exist('h1','var') && isvalid(h1)
    delete(h1);
end
h1 = ml_plot_data(X', plot_options);
axis tight

%% 1b) Generate 3D Broken Swiss Roll Dataset
% Start from a clean workspace, with no open figures and an empty console.
clear all;
close all;
clc;

% Generator options: 1000 points, 'swissroll_broken' dataset, and the
% generator's own plotting switched off.
options = struct('numberOfPoints', 1000, ...
                 'name',           'swissroll_broken', ...
                 'plot',           false);

% Generate dataset (third and fourth outputs are not needed here)
[X,labels,~,~] = ml_generate_manifold_dataset(options);
X      = X';
labels = labels';
[N, M] = size(X);

% Plot original data
plot_options        = struct('is_eig', false, 'points_size', 30, ...
                             'title', 'Broken SwissRoll');
plot_options.labels = labels;

% Remove a previous figure handle if one is still around
if exist('h1','var') && isvalid(h1)
    delete(h1);
end
h1 = ml_plot_data(X', plot_options);

%% 1c) Load Breast Cancer Dataset
% Start from a clean workspace, with no open figures and an empty console.
clear all;
close all;
clc;

% Read the CSV file; the 'last' argument is passed through to the loader
% (presumably the position of the label column - confirm in ml_load_data).
[X,labels,class_names] = ml_load_data('breast-cancer-wisconsin.csv', 'csv', 'last');
X      = X';
labels = labels';
[N, M] = size(X);

% Plot original data
plot_options        = struct('is_eig', false, 'points_size', 30, ...
                             'title', 'Breast Cancer Wisconsin');
plot_options.labels = labels;

% Remove a previous figure handle if one is still around
if exist('h1','var') && isvalid(h1)
    delete(h1);
end
h1 = ml_plot_data(X', plot_options);

%% 1d) Load the Digits Dataset
% Start from a clean workspace, with no open figures and an empty console.
clear all;
close all;
clc;

% Load the digits file, requesting the classes 0 through 5
[X,labels] = ml_load_digits_64('data/digits.csv', 0:5);

% Plot images: 64 randomly chosen rows of X shown with the [8 8] argument
if exist('hi','var') && isvalid(hi)
    delete(hi);
end
idx = randperm(size(X,1));
hi  = ml_plot_images(X(idx(1:64),:), [8 8]);

% Plot datapoints as images: a fresh random draw of 64 rows, restricted to
% columns 1, 10, 19, ..., 64 (every 9th column)
idx = randperm(size(X,1));
plot_options        = struct('is_eig', false, 'title', 'Original Image Data');
plot_options.labels = labels(idx(1:64));

if exist('h1','var') && isvalid(h1)
    delete(h1);
end
h1 = ml_plot_data(X(idx(1:64), 1:9:64), plot_options);

% Transpose afterwards so X and labels have the same orientation as in the
% other dataset cells
X      = X';
labels = labels';
[N, M] = size(X);

%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%            2)  Apply Kernel PCA on Dataset                 %%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% 2a) Compute kernel PCA of Dataset and Check Eigenvalues
% Compute kPCA with ML_toolbox
options = struct( ...
    'method_name',  'KPCA', ...   % Choosing kernel-PCA method
    'nbDimensions', 20, ...       % Number of Eigenvectors to compute.
    'kernel',       'gauss', ...  % Type of Kernel: {'poly', 'gauss'}
    'kpar',         2, ...        % Variance for the RBF Kernel
    ...                           % For 'poly' kpar = [offset degree]
    'norm_K',       true);        % Normalize the Gram Matrix
[proj_KPCA_X, mappingkPCA] = ml_projection(X', options);

% Plot EigenValues to try to find the optimal "p"
if exist('h3a','var') && isvalid(h3a)
    delete(h3a);
end
h3a = ml_plot_eigenvalues(diag(mappingkPCA.L));

% Plot result of Kernel PCA (first five projected dimensions)
plot_options = struct('is_eig', false, ...
                      'title',  'Projected data with kernel PCA');
plot_options.labels      = labels;
plot_options.plot_labels = {'$y_1$','$y_2$','$y_3$'};

if exist('h3','var') && isvalid(h3)
    delete(h3);
end
h3 = ml_plot_data(proj_KPCA_X(:,1:5), plot_options);

%% 2b) Plot Isolines of EigenVectors
% Isoline plot options
iso_plot_options = struct( ...
    'xtrain_dim',      [1 2], ...  % Dimensions of the original data to consider when computing the
    ...                            % Gram matrix (plots are 2D, original data might be of higher dimension)
    'eigen_idx',       1:4, ...    % Eigenvectors to use.
    'b_plot_data',     true, ...   % Plot the training data on top of the isolines
    'b_plot_colorbar', false, ...  % Plot the colorbar.
    'b_plot_surf',     false);     % Plot the isolines as (3d) surface
iso_plot_options.labels = labels;  % Plotted data will be colored according to class label

% Construct Kernel Data from the kPCA mapping computed in 2a
kernel_data              = [];
kernel_data.alphas       = mappingkPCA.V;
kernel_data.kernel       = mappingkPCA.kernel;
kernel_data.kpar         = [mappingkPCA.param1, mappingkPCA.param2];
kernel_data.xtrain       = X';
kernel_data.eigen_values = mappingkPCA.L;

if exist('h_isoline','var') && isvalid(h_isoline)
    delete(h_isoline);
end
[h_isoline,h_eig] = ml_plot_isolines(iso_plot_options, kernel_data);

%% 2c) Grid Search on the Gaussian kernel hyperparameter
grid_options = struct( ...
    'method_name',  'KPCA', ...
    'nbDimensions', 20, ...      % Maximum number of Eigenvectors
    'kernel',       'gauss');    % Kernel Type

%%% Example RBF Kernel: 1 Row of sigma values %%%
% kpars = [0.05,0.1, 0.2, 0.25, 0.5, 1, 1.5, 2, 5]; 
% kpars = [1, 1.5, 2, 2.5, 5, 7.5, 10, 15]; 
% Ten logarithmically spaced values between 0.5 and 10
kpars = logspace(log10(0.5), log10(10), 10);

eigenvalues = ml_kernel_grid_search(X', grid_options, kpars);

% Only the real part of the eigenvalues is plotted
if exist('h_eig2','var') && isvalid(h_eig2)
    delete(h_eig2);
end
h_eig2 = ml_plot_kpca_eigenvalues(real(eigenvalues), kpars);


%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%         3)  Apply Laplacian Eigenmaps on Dataset           %%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% 3a) Compute Laplacian Eigenmap with ML_toolbox
options = [];
options.method_name       = 'Laplacian';
options.nbDimensions      = 20; % Number of Eigenvectors to compute.
options.neighbors         = 7;  % Number of k-NN for Adjacency Graph
options.sigma             = 5; % Sigma for Similarity Matrix

try
    [proj_LAP_X, mappingLAP]  = ml_projection(X',options);
catch lapErr
    % Keep the user-facing hint, but attach the underlying exception as a
    % cause so the real reason for the failure (which may be unrelated to
    % the neighbors/sigma settings) is still reported.
    hintErr = MException('spectral_demo:laplacianProjectionFailed', ...
        'Please enter a higher number of neighbors or try different sigma!');
    throw(addCause(hintErr, lapErr));
end

% Plot EigenValues to try to find the optimal "p"
if exist('h3a','var') && isvalid(h3a), delete(h3a);end
h3a = ml_plot_eigenvalues(diag(mappingLAP.val));

% Plot result of Laplacian Eigenmaps Projection (first six dimensions)
plot_options              = [];
plot_options.is_eig       = false;
plot_options.labels       = labels;
plot_options.plot_labels  = {'$y_1$','$y_2$','$y_3$'};
plot_options.title        = 'Projected data with Laplacian Eigenmaps';
if exist('h4','var') && isvalid(h4), delete(h4);end
h4 = ml_plot_data(proj_LAP_X(:,1:6),plot_options);

%% 3b) Grid Search on the number of neighbors
% Laplacian Eigenmaps with a fixed sigma; the neighbor count is swept below
options = struct('method_name',  'Laplacian', ...
                 'nbDimensions', 20, ...
                 'sigma',        5);

% Candidate neighbor counts
neighborsPars = [5 7 10 15 20]; 
% neighborsPars = [30 50 70 100]; 

eigenvalues = ml_neighbors_grid_search(X', options, neighborsPars);

if exist('h_eig_lap','var') && isvalid(h_eig_lap)
    delete(h_eig_lap);
end
h_eig_lap = ml_plot_kpca_eigenvalues(eigenvalues, neighborsPars);

%% 3c) Grid Search on the width of the kernel

% Laplacian Eigenmaps with a fixed neighbor count; sigma is swept below
options = struct('method_name',  'Laplacian', ...
                 'nbDimensions', 20, ...
                 'neighbors',    7);

% Candidate sigma values
sigmaPars = [0.1, 0.2, 0.25, 0.5, 1, 1.5, 2, 5, 7]; 
% sigmaPars = [ 1, 2.5, 5, 7, 10]; 
% sigmaPars = logspace(log10(0.5),log10(15),10);

eigenvalues = ml_kernel_lap_grid_search(X', options, sigmaPars);

if exist('h_eig_lap2','var') && isvalid(h_eig_lap2)
    delete(h_eig_lap2);
end
h_eig_lap2 = ml_plot_kpca_eigenvalues(eigenvalues, sigmaPars);


%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%         4)  Apply Isometric Mapping on Dataset             %%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% 4) Compute Isomap with ML_toolbox
options = [];
options.method_name       = 'Isomap';
options.nbDimensions      = 10;       % Number of Eigenvectors to compute.
options.neighbors         = 20;      % Number of k-NN for Adjacency Graph

try
    [proj_ISO_X, mappingISO]  = ml_projection(X',options);
catch isoErr
    % Keep the user-facing hint, but attach the underlying exception as a
    % cause so the real reason for the failure is still reported.
    hintErr = MException('spectral_demo:isomapProjectionFailed', ...
        'Please enter a higher number of neighbors');
    throw(addCause(hintErr, isoErr));
end

% Rows of proj_ISO_X are the projected points (columns are indexed as
% dimensions below), so count rows explicitly: length() returns the largest
% dimension and would report the number of embedding dimensions instead
% whenever fewer points than dimensions were projected.
% M is the number of samples set when the dataset was loaded.
if size(proj_ISO_X,1)<M
    warning('Graph has disconnected components! \n Only %d points from the largest connected component were projected! \n There Ids are stored in: mappingISO.conn_comp.', size(proj_ISO_X,1))
end

% Plot EigenValues to try to find the optimal "p"
if exist('h3a','var') && isvalid(h3a), delete(h3a);end
h3a = ml_plot_eigenvalues(diag(mappingISO.val));

%% Plot result of Isometric Mapping Projection
% Labels are restricted to the points listed in mappingISO.conn_comp so
% they line up with the rows of proj_ISO_X.
plot_options              = [];
plot_options.is_eig       = false;
plot_options.labels       = labels(mappingISO.conn_comp);
plot_options.plot_labels  = {'$y_1$','$y_2$','$y_3$'};
plot_options.title        = 'Projected data with Isometric Mapping';
if exist('h5','var') && isvalid(h5), delete(h5);end
h5 = ml_plot_data(proj_ISO_X(:,1:5),plot_options);

%% 4b) Grid Search on the number of neighbors
% Isomap; the neighbor count is swept below
options = struct('method_name', 'Isomap', 'nbDimensions', 10);

% Candidate neighbor counts
% neighborsPars = [7 10 13 15 17 20]; 
neighborsPars = [30 35 40 50 60]; 

eigenvalues = ml_neighbors_grid_search(X', options, neighborsPars);

% Note: this reuses the handle name h_eig_lap from the Laplacian grid search
if exist('h_eig_lap','var') && isvalid(h_eig_lap)
    delete(h_eig_lap);
end
h_eig_lap = ml_plot_kpca_eigenvalues(eigenvalues, neighborsPars);

%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%         5)  Perform K_means Clustering Dataset             %%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% 5a) Perform kmeans clustering on the original data
% K-means with K = 5 and squared Euclidean distance
options     = struct('method_name', 'kmeans', 'K', 5);
result_orig = ml_clustering(X', options, 'Distance', 'sqeuclidean');

% Computation of F-measure [0,1]
F_orig = ml_Fmeasure(result_orig.labels, labels);
fprintf('F-measure: %f\n', F_orig);

%% Plot Decision Boundaries for 2D Data
if exist('h4a','var') && isvalid(h4a)
    delete(h4a);
end
% The clustering result struct doubles as the plot options for the boundary plot
result_orig.title       = 'K-Means on Original Data';
result_orig.plot_labels = {'$x_1$','$x_2$'};
h4a = ml_plot_class_boundary(X', result_orig);

%% Plot Clustered Data for >2D Original Data
plot_options = struct('is_eig', false, 'points_size', 30, ...
                      'title', 'Clustered Data on Original Data');
plot_options.labels      = result_orig.labels;
plot_options.plot_labels = {'$x_1$','$x_2$','$x_3$'};

if exist('h1','var') && isvalid(h1)
    delete(h1);
end
h1 = ml_plot_data(X', plot_options);

%% 5b) Perform kmeans clustering on the projected data

%%% Choose between KPCA - LAP - ISO
algo = 'KPCA';
% algo = 'LAP';
% algo = 'ISO';

% Select the first two projected dimensions of the chosen method, together
% with the ground-truth labels that correspond to the projected points.
labels_proj = labels;
switch algo
    case 'KPCA'
        Y         = proj_KPCA_X(:,1:2)';
        algo_name = 'Kernel PCA';
    case 'LAP'
        Y         = proj_LAP_X(:,1:2)';
        algo_name = 'Laplacian Eigenmaps';
    case 'ISO'
        Y         = proj_ISO_X(:,1:2)';
        algo_name = 'Isomap';
        % The Isomap projection may contain only the points listed in
        % mappingISO.conn_comp, so compare against the matching labels
        % (same subsetting as in the Isomap projection plot).
        labels_proj = labels(mappingISO.conn_comp);
    otherwise
        % Fail early instead of silently clustering a stale or undefined Y
        error('spectral_demo:unknownAlgo', ...
              'Unknown projection algorithm ''%s''. Use ''KPCA'', ''LAP'' or ''ISO''.', algo);
end

% K-Means Options
options = [];
options.method_name   = 'kmeans';
options.K             = 6;
[result_proj]         = ml_clustering(Y',options,'Start','plus','Distance','sqeuclidean');

% Computation of F-measure [0,1]
F_proj = ml_Fmeasure(result_proj.labels,labels_proj);
fprintf('F-measure: %f\n',F_proj);

%% Plot Decision Boundaries for 2D Projections
% The title reports the K actually used above rather than a fixed number
result_proj.title       = sprintf('K ($%d$)-Means on %s Projected Data', options.K, algo_name);
result_proj.plot_labels = {'$y_1$','$y_2$'};
if exist('h4b','var') && isvalid(h4b), delete(h4b);end
h4b = ml_plot_class_boundary(Y',result_proj);

%% Plot Clustered Data for >2D Projections
plot_options             = [];
plot_options.is_eig      = false;
plot_options.points_size = 30;
plot_options.labels      = result_proj.labels;
plot_options.plot_labels = {'$y_1$','$y_2$','$y_3$'};
plot_options.title       = 'Clustered Data on Projections';

if exist('h1','var') && isvalid(h1), delete(h1);end
h1 = ml_plot_data(Y',plot_options);

%% 5c) Perform kmeans Model Selection (RSS, BIC, AIC)

% Selected Projected Data from KPCA/Laplacian/Isomap
% (uses the variable algo set in the previous cell; Y is left untouched if
% algo matches none of the cases)
switch algo
    case 'KPCA'
        Y = proj_KPCA_X(:,[1 2])';
    case 'LAP'
        Y = proj_LAP_X(:,[1 2])';
    case 'ISO'
        Y = proj_ISO_X(:,[1 2])';
end

% Clustering Options: k-means evaluated for K = 1..10 with 10 repeats
cluster_options = struct('method_name', 'kmeans');
repeats         = 10;
Ks              = 1:10;
[mus, stds]     = ml_clustering_optimise(Y', Ks, repeats, cluster_options, ...
                                         'Start', 'plus', 'Distance', 'sqeuclidean');

% Plot RSS, AIC and BIC
if exist('h4c','var') && isvalid(h4c)
    delete(h4c);
end
% 'together' shows RSS/AIC/BIC in the same plot; the alternative value is
% written 'seperate' in the original script (spelling kept as the option name).
% plot_options is not reset here, so fields set by earlier cells are passed along.
plot_options.plot_type = 'together';
h4c = ml_plot_rss_aic_bic(mus, stds, Ks, plot_options);
