function [V, S_unscaled, depth, S_scaled] = Func_Sparsity_Depth(X, y, B_target, thresholdingType, q, lossType, zetaGrid, optsBCD, optsV, optsS, rho_depth)
%==========================================================================
% This function computes the Theta^# sparsity depth of a given B (a vector).
% The objective is optimized over (V, S) by block coordinate descent (BCD),
% repeated along an increasing grid of annealing (steepness) parameters.
% The first grid point is solved from several initial values of V and the
% best one is kept; every later grid point is warm-started from the
% previous solution.
%------------------------ Input Variables ---------------------------------
% X               - design matrix
% y               - response vector
% B_target        - the point of interest at which the sparsity depth is
%                   calculated (assumed to be a vector in the implementation)
% thresholdingType- regularization type for computing the depth (can be
%                   either a penalty or a constraint). Only 'quantile'
%                   (the L0 constraint) is implemented; any other value
%                   raises an error.
% q               - regularization parameter value. It must be specified.
%                   In the L0-constraint case (or quantile thresholding),
%                   this parameter just gives the pre-specified model
%                   cardinality. In this function it is only used to pick
%                   the initial value of S (see below).
% lossType        - loss function (not necessarily the quadratic loss)
% zetaGrid        - the grid of the annealing parameters
%                   (optional; default 1.25.^linspace(0, 10, 11))
% optsBCD         - options for the BCD that jointly optimizes V and S
%                   (optional; fields maxiter, tolgradnorm)
% optsV           - options for optimizing V (optional). The field
%                   Ninisample gives the number of initial values of V
%                   tried at the first grid point (10 if absent).
% optsS           - options for optimizing S (optional)
% rho_depth       - a scale parameter added in the objective function
%                   (optional). Theoretically it should make no difference
%                   for the L0 sparsity depth, but practically, due to our
%                   approximation scheme, a good empirical choice can lead
%                   to smaller objective function values; see the note
%                   below.
% The optional inputs may be omitted or passed as [] to get the defaults.
%------------------------ Output Variables --------------------------------
% V               - solution of V
% S_unscaled      - solution of S (for the objective with rho_depth=1)
% depth           - the depth value at convergence
% S_scaled        - solution of S for the scaled objective using rho_depth
%                   (S_scaled * rho_depth == S_unscaled)
%==========================================================================
debug = 0;
%==========================================================================
% Specify some parameters for the sparsity-depth computation
p = size(X, 2);
if ~exist('rho_depth', 'var') || isempty(rho_depth)
    % Make a default choice of rho.
    % Note: Theoretically, the 0-1 depth should be invariant to rho_depth,
    % but because we use a progressive approximation scheme, rho_depth
    % shows some numerical differences. Experience shows that this default
    % choice often leads to a small depth during the optimization (if not
    % the smallest).
    rho_depth = 1e-4 * norm(X, 2)^2;
end

% Every trailing optional argument is guarded with exist(), like rho_depth,
% so that omitting it selects the default instead of raising an error.
if ~exist('optsBCD', 'var') || isempty(optsBCD)
    optsBCD = struct();
    optsBCD.maxiter = 100;
    optsBCD.tolgradnorm = 1e-2;
end
if ~exist('optsV', 'var') || isempty(optsV)
    optsV = struct();
    optsV.gradnorm = 1e-4;
    optsV.verbosity = 0;
    optsV.Ninisample = 10;
    optsV.method = 'trustregion';
    optsV.usehess = 'true';
    optsV.checkhess = 'false';
end
if ~exist('optsS', 'var') || isempty(optsS)
    optsS = struct();
    optsS.optTol = 1e-12;
    optsS.verbose = 0;
    optsS.maxIter = 50;
end
if ~exist('zetaGrid', 'var') || isempty(zetaGrid)
    zetaGrid = 1.25.^linspace(0, 10, 11);
end

% Number of initial values of V at the first grid point. It is read into a
% local so that a caller-supplied optsV lacking the field still works and
% the struct handed to the V-solver is left untouched.
if isfield(optsV, 'Ninisample') && ~isempty(optsV.Ninisample)
    NumInits = optsV.Ninisample;
else
    NumInits = 10;
end

%==========================================================================
switch thresholdingType
    case 'quantile' % This corresponds to using an l0 constraint
        gamma = zeros(p, 1);
    otherwise % 'hard', 'soft', 'scad'
        error('Not implemented yet');
end
% Find the lambda value used as the constraint on the inf-norm of s
lambda = Func_lambda_Sparsity_Depth_ThetaSharp(X, y, rho_depth, B_target, lossType);

% Jc flags the zero components of B_target. Where B_target is nonzero both
% bounds are 0, so the corresponding s is fixed at 0 in the BCD.
Jc = B_target == 0;
LB = -lambda * Jc; % the lower bound for s in the BCD optimization
UB = lambda * Jc;  % the upper bound for s in the BCD optimization

R = Funct_GLMResidual_Gradient(y, X, B_target, lossType);
Xt = X.';
[~, m] = size(R); % does not change across grid points

% The grid must be increasing; otherwise sort it (and drop duplicates).
gridsize = length(zetaGrid);
if sum(abs(sort(zetaGrid, 'ascend') - zetaGrid))>0
    zetaGrid = unique(zetaGrid);
    gridsize = length(zetaGrid);
    warning('Please provide a zetaGrid with strictly increasing steepnees values.')
end

if (q / size(X,2)) <= 0.2
    % NOTE(review): rand(p,1)-1 lies in (-1, 0), so the random start is
    % one-sided (negative) and is not restricted to Jc - confirm intended.
    S_init = lambda * (rand(p,1)-1);
else
    % q relatively large: the depth is often closer to the regression depth
    S_init = zeros(p, 1);
end

for i = 1:gridsize
    zeta = zetaGrid(i);
    if i == 1
        %% First grid point: run the BCD from several initial values of V
        Vres = zeros(p, m, NumInits);
        Sres = cell(NumInits, 1);
        fres = zeros(NumInits, 1);
        Vs_init = Funct_InitVs(Xt, R, NumInits);
        for j = 1:NumInits
            V_init = Vs_init(:, :, j);
            [Vres(:,:,j), Sres{j}, fres(j)] = Func_Sparsity_Depth_InAnn(X, Xt, V_init, S_init, R, gamma, zeta, LB, UB, optsBCD, optsV, optsS, rho_depth);
        end
        % Keep the start with the smallest objective (first one on ties).
        % min() skips NaNs and still returns a valid index when all values
        % are NaN.
        [~, optBInd] = min(fres);
        V = Vres(:,:, optBInd);
        S = Sres{optBInd};
        depth = fres(optBInd);
    else
        %% Later grid points: warm-start V and S from the previous solution
        V_init = V;
        S_init = S;
        [V, S, depth] = Func_Sparsity_Depth_InAnn(X, Xt, V_init, S_init, R, gamma, zeta, LB, UB, optsBCD, optsV, optsS, rho_depth);
    end

    if debug > 0
        fprintf('Steepness zeta=%d finished, depth=%.3f \n', zeta, depth);
    end
end

% S solves the scaled objective; multiply by rho_depth to undo the scaling.
S_scaled = S;
S_unscaled = S * rho_depth;
end






function [V, S, depth] = Func_Sparsity_Depth_InAnn(X, Xt, V_init, S_init, R, gamma, zeta, LB, UB, optsBCD, optsV, optsS, rho_depth)
%==========================================================================
% Inner solver for one annealing parameter zeta: block coordinate descent
% (BCD) alternating between V (with S fixed) and S (with V fixed).
%------------------------ Input Variables ---------------------------------
% X, Xt           - design matrix and its transpose
% V_init, S_init  - starting values of V and S
% R               - residual/gradient term, multiplied elementwise with X*V
% gamma           - thresholding offset vector
% zeta            - annealing (steepness) parameter, passed to both solvers
% LB, UB          - elementwise lower/upper bounds for S
% optsBCD         - BCD options: maxiter (maximum number of sweeps) and
%                   tolgradnorm (stopping tolerance on both gradient norms)
% optsV, optsS    - options forwarded to the V-solver and the S-solver
% rho_depth       - scale parameter of the objective
%------------------------ Output Variables --------------------------------
% V, S            - the last iterates
% depth           - fraction of the n entries of M + U*S that are >= 0
%==========================================================================
maxiter = optsBCD.maxiter;
V = V_init;
S = S_init;
n = size(X,1);
debug = 0;
%% Use BCD to optimize S and V
for i = 1: maxiter
    %% Fix S, solve V
    Shift =  ones(n,1)*(gamma - S).'/n;
    ShiftT = Shift.';
    [V, ~, info] = aux_sparsity_depth_BCD_V(X, Xt, V, R, Shift, ShiftT, zeta, optsV, rho_depth);

    %% Fix V, solve S
    Vt = V.';
    % NOTE(review): Vt * gamma * ones(n,1) only conforms when V has a
    % single column - confirm that this is the intended use.
    M = (1 / rho_depth) * (X * V).*R + Vt * gamma * ones(n,1)/n;
    U = -ones(n,1) * Vt / n;
    [S, ~,gnorm] = aux_sparsity_depth_BCD_S(M, U, S, zeta, LB, UB, optsS);

    gradV = info.gradnorm;

    % Stop as soon as both blocks are (approximately) stationary. The
    % debug flag controls only the message: it used to be part of the
    % condition itself, so with debug = 0 the loop never stopped early and
    % optsBCD.tolgradnorm had no effect.
    if (gradV(end) < optsBCD.tolgradnorm) && (gnorm < optsBCD.tolgradnorm)
        if debug
            fprintf('Gradient norm less than optsBCD.tolgradnorm\n');
        end
        break
    end
end
%==========================================================================
% The 0-1 depth is computed according to 1_{>=0} as the measure
Xdiag =  M + U * S;
depth = sum(Xdiag >= 0)/size(X,1);
end

function [lambda] = Func_lambda_Sparsity_Depth_ThetaSharp(X, y, rho_depth, B_target, lossType)
% This function provides the value of lambda (the constraint parameter for
% ||s||_\infty) in the Theta^#-depth (corresponding to the l0 constraint),
% which changes with B_target.
%
% lambda is the largest absolute entry of the loss gradient at B_target,
% divided by rho_depth, taken over the components where B_target is
% exactly zero. If B_target has no zero component, lambda is 0.

% For the gaussian loss (regression case):
%   g = (1 / rho_depth) * X' * (X * B_target - y)
[~, g] = Funct_GLMLoss_Gradient(B_target, X, y, 0, lossType);
g = (1 / rho_depth) * g;
% Assume B_target is a sparse vector in the Theta^# depth
g_Jc = g(B_target == 0);
if isempty(g_Jc)
    % No zero component: max() of an empty array would return an empty
    % lambda, which breaks the bound and initial-value products in the
    % caller. Return 0 instead.
    lambda = 0;
else
    lambda = max(abs(g_Jc));
end
end
