% -o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-
%               SAMPLE CODE FOR THE PERFORMANCE PREDICTION CHALLENGE
%             ADAPTED FOR THE FEATURE EXTRACTION COURSE WS 2005/06
%             Isabelle Guyon -- isabelle@clopinet.com -- April 2006
% -o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-

% DISCLAIMER: ALL INFORMATION, SOFTWARE, DOCUMENTATION, AND DATA ARE PROVIDED "AS-IS" 
% ISABELLE GUYON AND/OR OTHER ORGANIZERS DISCLAIM ANY EXPRESSED OR IMPLIED WARRANTIES, 
% INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
% FOR ANY PARTICULAR PURPOSE, AND THE WARRANTY OF NON-INFRINGEMENT OF ANY THIRD PARTY'S 
% INTELLECTUAL PROPERTY RIGHTS. IN NO EVENT SHALL ISABELLE GUYON AND/OR OTHER ORGANIZERS 
% BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
% ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF SOFTWARE, DOCUMENTS, 
% MATERIALS, PUBLICATIONS, OR INFORMATION MADE AVAILABLE FOR THE CHALLENGE. 
%
% ---------
% Tested and modified by: 
% Gideon Dror, gideon@mta.ac.il
% Amir Reza Saffari Azar, amir@ymer.org.
% September 2005 - April 2006
% ---------

clear all

% -o-|-o-|-o-|-o-|-o-|-o-|-o- BEGIN USER-PREFERENCES -o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-

% 1) User-defined directories (no slash at the end of the names)
% ---------------------------------------------------------------
% Expected layout: my_root already contains two directories,
%   Data    -- the datasets
%   Clop    -- the CLOP package (or the sample code)
% Three more directories are created in my_root if they do not exist yet:
%   Results -- the model predictions
%   Zipped  -- zipped files ready to submit to the challenge web site
%   Models  -- the CLOP models

% Root of the project; change this to the directory of your project.
my_root = pwd;

% Path to the five data directories (ARCENE, DEXTER, DOROTHEA, GISETTE, and
% MADELON) downloaded from http://clopinet.com/isabelle/Projects/NIPS2003/
% or http://www.nipsfsc.ecs.soton.ac.uk/datasets/
% An absolute path works too, e.g. 'C:\Users\Isabelle\Projects\ETH\Data'.
data_dir = [my_root, '/Data'];

% Where the results will end up.
resu_dir = [my_root, '/Results'];

% Zipped files with results, ready to go!
zip_dir = [my_root, '/Zipped'];

% Where the trained models will end up.
% Note: you need not include the models in your submissions, unless you
% want to qualify for a bonus entry. In this example, the models are saved
% in full in the model_dir directory and a lean version (with the
% hyperparameters and the smallest trained entities) is saved in the
% resu_dir directory.
model_dir = [my_root, '/Models'];

% Change this value to 0 if you want to be warned when a file already
% exists before saving a result or model.
ForceOverWrite = 1;

% Path to the sample code or the Challenge Learning Objects Package (CLOP).
code_dir = [my_root, '/Clop'];

% 2) Choose your data and models
% ------------------------------
% Datasets to process, in this order.
dataset = {'arcene', ...
           'dexter', ...
           'dorothea', ...
           'gisette', ...
           'madelon'};

% Model example names, e.g. {'zarbi', 'Prepro+linearSVC'}. You can create
% your own models by making your own chains. If this array is left empty,
% it is replaced by a default list of model names.
% For a list of model examples, use:
% > model_examples
modelset = {'baseline'};

% Use the pixel representation (only applied to Gisette).
global UsePixelRep;
UsePixelRep = 0;

% To save memory, do not load the test data when the data are first read.
DoNotLoadTestData = 1;

% If this flag is zero, training is done on the training data only.
% Otherwise training and validation data are merged.
% NOTE: in this example script the dataset loop assigns MergeDataSets again
% for every dataset (1 for all datasets but Dorothea, 0 for Dorothea), so
% the value given here is overwritten before it is used.
MergeDataSets = 1;

% If this flag is positive, k-fold cross-validation is performed,
% with k=FoldNum.
FoldNum = 0;

% Note: for compatibility reasons, even if training and validation sets
% were merged for training, they are tested separately at the end so the
% results can be submitted to the challenge web site.
% -o-|-o-|-o-|-o-|-o-|-o-|-o- END USER-PREFERENCES -o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-
fprintf('\n-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-\n');

% Set the path and defaults properly; create directories
% ------------------------------------------------------
% The code directory is put on the path first, then the script checks which
% of the two known entry points is visible from there.
addpath(code_dir);
if exist('use_spider_clop.m') == 2
    % use_spider_clop.m is visible: hand it the code directory.
    use_spider_clop(code_dir);
elseif exist('code_version.m') == 2
    % Only code_version.m is visible: report the sample code version.
    fprintf('Sample code version : %s\n', code_version('sample_code'));
else
    % Neither entry point was found: report the problem, print whichever
    % README files exist, and stop the script.
    disp('ERROR: Wrong code path. Check your directories and path variables.');
    if exist('README.txt') == 2
        type('README.txt');
    end
    if exist('Data/README.txt') == 2
        type('Data/README.txt');
    end
    if exist('Clop/README.txt') == 2
        type('Clop/README.txt');
    end
    return;
end

% An empty model list is replaced by what model_examples returns when it
% is called without arguments.
if isempty(modelset)
    modelset = model_examples;
end

% Output directories for the results, the zip archives and the models.
makedir(resu_dir);
makedir(zip_dir);
makedir(model_dir);

% LOOP OVER DATASETS 
% ==================
% For every dataset: load the data; then, for every model, build it, train
% it (optionally preceded by k-fold cross-validation), test it on every
% data split, and save the predictions, the feature indices and the models.
for k = 1:length(dataset)
    
    data_name   = dataset{k};
    % Training and validation data are merged for every dataset except
    % Dorothea. This assignment replaces whatever value MergeDataSets had
    % before the loop.
    if strcmp(data_name, 'dorothea')
        MergeDataSets=0; 
        % Do not train on validation data for Dorothea
    else
        MergeDataSets=1;
    end
    
    fprintf('\n-o-|-o-|-o-|-o-|-o-|-o-      %s      -o-|-o-|-o-|-o-|-o-|-o-\n', upper(data_name));
    fprintf('\n-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-\n\n');
    
    % Create a data structure and check the data
    %===========================================
    fprintf('-- %s loading data --\n', upper(data_name));
    input_dir   = [data_dir '/' upper(data_name)];
    input_name  = [input_dir '/' data_name];
    
    D   = create_data_struct(input_name, DoNotLoadTestData);
    % Each member of D (D.train, D.valid, and D.test) 
    % is a data object (a structure) with members X and Y (if Y is given). 
    
    % New: compute data statistics
    data_stats(D);
    
    % Special for Gisette:
    % (both operands are scalars, so the short-circuit operator is used)
    if UsePixelRep && strcmp(data_name , 'gisette')
        D = pixel_rep(D, input_name);
    end
    
    fprintf('-- %s data loaded --\n', upper(data_name));
    % Note: the data are saved as a Matlab structure 
    % so they will load faster the second time around. 

    % LOOP OVER MODELS 
    % ================
    for j = 1:length(modelset)
        
        model_name  = modelset{j};
        
        % Select some features
        %=====================
        % (Nothing is done here in this example.)
        
        % Build a model
        %==============
        fprintf('-- %s-%s building  model\n', upper(data_name), upper(model_name));
        my_model    = model_examples(model_name, data_name);
        % Note: "model_examples" calls a model constructor and returns a learning object.
        % Enter at the prompt "> type model_examples" to view the examples.
        % All learning objects have the two methods "train" and "test".
        % To see the data members, type at the prompt "> struct(my_model)".
        fprintf('-- %s-%s model built\n', upper(data_name), upper(model_name));
        
        % Train the model
        %================
        if MergeDataSets
            % Train on training + validation data, then get rid of the
            % validation set. rmfield returns the modified structure, so
            % its result must be assigned back to D to take effect.
            % D is re-loaded from disk before testing (MergeDataSets is
            % true in this branch), so the original split is restored.
            D.train=data([D.train.X;D.valid.X], [D.train.Y;D.valid.Y]);
            D = rmfield(D, 'valid');
        end
        
        if FoldNum>0
            % Create a CV model
            cv_model=cv(my_model, {['folds=' num2str(FoldNum)], 'store_all=0'});
            fprintf('-- %s-%s performing %d fold cross-validation\n', upper(data_name), upper(model_name), FoldNum);
            % Start the timer whose value is reported once cross-validation is done
            tic;
            % Call the method "train" of the object "cv_model":
            cv_output = train(cv_model, D.train); 
            % Collect the results: per-fold BER plus the pooled outputs
            OutX=[]; OutY=[]; ber=zeros(1, FoldNum);
            for kk=1:FoldNum
                outX=cv_output.child{kk}.X;
                outY=cv_output.child{kk}.Y;
                OutX=[OutX; outX]; 
                OutY=[OutY; outY]; 
                ber(kk)=balanced_errate(outX, outY);
            end
            % BER over the pooled fold outputs; the error bar is the
            % standard deviation of the fold BERs (normalized by the number
            % of folds) divided by sqrt(FoldNum)
            cvber=balanced_errate(OutX, OutY);
            ebar=std(ber,1)/sqrt(FoldNum);
            fprintf('CV BER=%5.2f+-%5.2f%%\n', 100*cvber, 100*ebar); 
            fprintf('-- %s-%s cross-validation done in %5.2f seconds\n', upper(data_name), upper(model_name), toc);
            fprintf('-- %s-%s training model on all training data\n', upper(data_name), upper(model_name));
        end
        
        % Call the method "train" of the object "my_model":
        tic;
        [train_output my_model] = train(my_model, D.train);  
        ber = balanced_errate(train_output.X, train_output.Y);
        % The second argument of error_bar is the number of entries of Y
        % that are greater than zero
        ebar = error_bar(ber, nnz(train_output.Y>0));
        fprintf('TRAIN BER=%5.2f +-%5.2f%%\n', 100*ber, 100*ebar); 
        fprintf('-- %s-%s model trained in %5.2f seconds\n', upper(data_name), upper(model_name), toc);

        % Test the model
        %===============
        if DoNotLoadTestData || MergeDataSets
            % Now (re-)load everything
            D = create_data_struct(input_name);
            % Special for homework 4:
            if UsePixelRep && strcmp(data_name , 'gisette')
                D = pixel_rep(D, input_name);
            end
        end
        % Note: in order to be able to make valid challenge
        % submissions, here we keep the original data split.
        % Thus the training error will differ from the
        % error rate on all the training data and the validation
        % error will be similar to the training error (since the
        % validation data was used for training.)
        tic;
        fprintf('-- %s-%s testing model\n', upper(data_name), upper(model_name));
        set_name    = fieldnames(D); % 'train', 'valid' and 'test'
        % One cell per data split; re-created for every model so that no
        % entry from a previous iteration can linger
        discriminant = cell(1, length(set_name));
        
        for i = 1:length(set_name) % Loop over training, validation, and test set
            % Call the method "test" of the object "my_model" 
            % on D.train, D.valid, or D.test:
            output  = test(my_model, D.(set_name{i})); 
            discriminant{i} = output.X; % The "test" method returns the predicted discriminant values in output.X
            target = output.Y; % If the targets were provided in the dataset, they are copied in output.Y
            if ~isempty(target)
                ber = balanced_errate(discriminant{i},target);
                ebar = error_bar(ber, nnz(target>0));
                fprintf('%s, BER=%5.2f +-%5.2f%%\n', set_name{i}, 100*ber, 100*ebar); 
            end
        end       
        fprintf('-- %s-%s model tested in %5.2f seconds\n', upper(data_name), upper(model_name), toc);  
        
        % Save the results
        %=================
        fprintf('-- %s-%s saving the results\n', upper(data_name), upper(model_name));
        resu_name   = [resu_dir '/' model_name ];
        makedir(resu_name);
        resu_name   = [resu_name '/' data_name]; 
        % Loop over 'train', 'valid', and 'test': the sign of the
        % discriminant goes to the .resu file, its magnitude to the .conf file
        for i=1:length(set_name) 
            save_outputs([resu_name '_' set_name{i} '.resu'], sign(discriminant{i}), ForceOverWrite);        
            save_outputs([resu_name '_' set_name{i} '.conf'], abs(discriminant{i}), ForceOverWrite);        
        end
        % Save the indices of the features
        idx_feat=get_fidx(my_model);
        save_outputs([resu_name '.feat'], idx_feat, ForceOverWrite);
        fprintf('-- %s-%s results saved as %s*\n', upper(data_name), upper(model_name), resu_name);
        
        % Save the models
        %================
        fprintf('-- %s-%s saving the models\n', upper(data_name), upper(model_name));
        % Remove the biggest structures and arrays to make the model smaller
        % Save the lean model in the results directory to qualify for a "bonus" entry:
        my_lean_model=clean(my_model); 
        mresu_name  = [resu_dir '/' model_name '/' data_name '_model'];
        save_model(mresu_name, my_lean_model, ForceOverWrite);
        % Save also the whole trained model for further reference:
        mresu_name  = [model_dir '/' model_name ];
        makedir(mresu_name);
        mresu_name  = [mresu_name '/' data_name '_model'];
        save_model(mresu_name, my_model, ForceOverWrite);
        fprintf('-- %s-%s models saved as %s*\n', upper(data_name), upper(model_name), mresu_name);
        fprintf('\n-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-|-o-\n');
    
    end % Loop over models
end % Loop over datasets

% Zip the archives so they are ready to go!
% -----------------------------------------
% Without a JVM the script stops here: no archive is built and nothing
% after this point in the script is executed.
if ~usejava('jvm')
    warning('Java is not loaded, failed to generate ZIP files !!!');
    return;
end

% One archive per model; a message is printed only when zipall returns a
% non-empty archive name.
for k = 1:length(modelset)
    model_name  = modelset{k};
    zip_name    = zipall(model_name, resu_dir, zip_dir);
    if isempty(zip_name)
        continue;
    end
    fprintf('-- %s zip archive created, see %s --\n', upper(model_name), zip_name);
end

% Score the models
% ----------------
% The progress message takes no argument, so its format string must not
% contain a conversion: the stray %s of the earlier version had no value
% to print.
fprintf('-- scoring the models --\n');
% simple_score is invoked without arguments; it is defined outside this file.
simple_score;