CARS Procedure Vignette (Simulation Setting 1)

Weinan Wang

2019-01-19

This vignette illustrates the basic usage of CARS. For the sake of simplicity, we will consider the Simulation Setting 1 in the referenced paper.

# global parameters
m <- 5000; # number of tests
n_x <- 50; # number of samples for each location in x
n_y <- 60; # number of samples for each location in y
Nrep <- 50; # number of replications
alpha <- 0.05; # nominal level fdr

# next we generate the data matrices for x and y
# for illustrative purposes, we choose k=500
k <- 500;
mu_x_1 <- 5/sqrt(30); # signal strength for first 500 locations of x
mu_y_1 <- 3/sqrt(30); # signal strength for first 500 locations of y
mu_x_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of x
mu_y_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of y
sd_x <- 1; # standard deviation for all locations of x
sd_y <- sqrt(2); # standard deviation for all locations of y
theta <- rep(0,m); # the vector for true states of nature
theta[1:k] <- 1; # signal locations: first 500
# initialize the vector of false discovery proportions
FDP <- rep(0,Nrep);
# initialize the vector of missed discovery proportions
MDP <- rep(0,Nrep);

Next, we iterate the CARS procedure and calculate the corresponding FDR and MDR level:

library(CARS)
library(np)

## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-8)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]

for (i in 1:Nrep){
    # initialize matrices for x and y
    x <- matrix(rnorm(m*n_x,0,sd_x),m,n_x);
    y <- matrix(rnorm(m*n_y,0,sd_y),m,n_y);

    # fill in data matrices according to Setting 1 (k=500)
    x[c(1:k),] <- x[c(1:k),] + mu_x_1;
    y[c(1:k),] <- y[c(1:k),] + mu_y_1;
    x[c((k+1):(2*k)),] <- x[c((k+1):(2*k)),] + mu_x_2;
    y[c((k+1):(2*k)),] <- y[c((k+1):(2*k)),] + mu_y_2;

    # CARS procedure at level alpha
    decision <- CARS(x,y,alpha,option='regular')$de;

    # false discovery proportion
    FDP[i] <- sum((1-theta)*decision)/max(sum(decision),1);

    # missed discovery proportion
    MDP[i] <- sum(theta*(1-decision))/max(sum(theta),1);
}

#finally we obtain the FDR and MDR levels by averaging the proportions over Nrep=50 replications

FDR <- mean(FDP);
MDR <- mean(MDP);
FDR;

## [1] 0.0523021

MDR;

## [1] 0.77688

Notice the FDR is well controlled under nominal level 0.05.

For the sparse case, we inrease the dimension m and reduce the number of signals k (other parameters are same as above):

# global parameters
m <- 10000; # number of tests
n_x <- 50; # number of samples for each location in x
n_y <- 60; # number of samples for each location in y
Nrep <- 50; # number of replications
alpha <- 0.05; # nominal level fdr


k <- 10;
mu_x_1 <- 5/sqrt(30); # signal strength for first 500 locations of x
mu_y_1 <- 3/sqrt(30); # signal strength for first 500 locations of y
mu_x_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of x
mu_y_2 <- 4/sqrt(30); # signal strength for location 501 to 1000 of y
sd_x <- 1; # standard deviation for all locations of x
sd_y <- sqrt(2); # standard deviation for all locations of y
theta <- rep(0,m); # the vector for true states of nature
theta[1:k] <- 1; # signal locations: first 500
# initialize the vector of false discovery proportions
FDP <- rep(0,Nrep);
# initialize the vector of missed discovery proportions
MDP <- rep(0,Nrep);

Next, we iterate the CARS procedure and calculate the corresponding FDR and MDR level using the sparse option:

library(CARS)
library(np)
for (i in 1:Nrep){
    # initialize matrices for x and y
    x <- matrix(rnorm(m*n_x,0,sd_x),m,n_x);
    y <- matrix(rnorm(m*n_y,0,sd_y),m,n_y);

    # fill in data matrices according to Setting 1 (k=500)
    x[c(1:k),] <- x[c(1:k),] + mu_x_1;
    y[c(1:k),] <- y[c(1:k),] + mu_y_1;
    x[c((k+1):(2*k)),] <- x[c((k+1):(2*k)),] + mu_x_2;
    y[c((k+1):(2*k)),] <- y[c((k+1):(2*k)),] + mu_y_2;

    # CARS procedure at level alpha
    decision <- CARS(x,y,alpha,option='sparse')$de;

    # false discovery proportion
    FDP[i] <- sum((1-theta)*decision)/max(sum(decision),1);

    # missed discovery proportion
    MDP[i] <- sum(theta*(1-decision))/max(sum(theta),1);
}

#finally we obtain the FDR and MDR levels by averaging the proportions over Nrep=50 replications

FDR <- mean(FDP);
MDR <- mean(MDP);
FDR;

## [1] 0

MDR;

## [1] 0.982