Title: | Kernel Machine Score Test for Semi-Competing Risks |
---|---|
Description: | Kernel Machine Score Test for Pathway Analysis in the Presence of Semi-Competing Risks. Method is detailed in: Neykov, Hejblum & Sinnott (2018) <doi: 10.1177/0962280216653427>. |
Authors: | Matey Neykov [aut], Boris P Hejblum [aut, cre], Jennifer A Sinnot [aut] |
Maintainer: | Boris P Hejblum <[email protected]> |
License: | GPL-2 | file LICENSE |
Version: | 1.0.6 |
Built: | 2025-01-13 05:35:15 UTC |
Source: | https://github.com/borishejblum/kernscr |
Kernel Machine Score Test for Pathway Analysis in the Presence of Semi-Competing Risks
Package: | kernscr |
Type: | Package |
Version: | 1.0.6 |
Date: | 2023-04-17 |
License: | GPL-2 |
The main function of the kernscr package is compute_all_tests
Matey Neykov, Boris P. Hejblum, Jennifer A. Sinnott — Maintainer: Boris P. Hejblum
Neykov M, Hejblum BP, Sinnot JA, Kernel Machine Score Test for Pathway Analysis in the Presence of Semi-Competing Risks, Stat Methods in Med Res, , 27(4): 1099-1114 (2018). <doi: 10.1177/0962280216653427>.
70 pathways from MSigDB c2CP
data("cancer_pathways")
data("cancer_pathways")
a list of 70 relevant pathways from an old version of MSigDB c2CP containing the Entrez IDs.
MJ van de Vijver,YD He, LJ van't Veer, H Dai, AAM Hart, DW Voskuil, A gene-expression signature as a predictor of survival in breast cancer, The New England Journal of Medicine, 347(25):1999-2009, 2002.
T Cai, G Tonini, X Lin, Kernel Machine Approach to Testing the Significance of Multiple Genetic Markers for Risk Prediction, Biometrics, 67(3):975-986, 2011.
data("cancer_pathways") if(interactive()){ ##get the data from Vijver publication #clinical data import_xls_from_zip <- function(urlPath, filename, zipname, skip=0){ zipFile <- paste0(zipname, ".zip") download.file(paste0(urlPath, zipFile), zipFile) unzip(zipFile, exdir="./temp_unzip") xlsFile <- paste0("./temp_unzip/", filename, ".xls") res <- readxl::read_xls(xlsFile, skip=skip) unlink(zipFile) unlink("./temp_unzip", recursive=TRUE) return(res) } BC_dat_clin <- import_xls_from_zip2(urlPath="http://ccb.nki.nl/data/", filename="Table1_ClinicalData_Table", zipname="nejm_table1", skip=2 ) BC_dat_clin <- BC_dat_clin[order(BC_dat_clin$SampleID), ] col2rmv <- 1:ncol(BC_dat_clin) BC_dat_clin$ID <- paste0("S", BC_dat_clin$SampleID) rownames(BC_dat_clin) <- BC_dat_clin$ID BC_dat_clin$evdeath <- BC_dat_clin$EVENTdeath BC_dat_clin$tsurv <- BC_dat_clin$TIMEsurvival BC_dat_clin$evmeta <- BC_dat_clin$EVENTmeta BC_dat_clin$tmeta<- pmin(BC_dat_clin$TIMEsurvival, BC_dat_clin$TIMEmeta, na.rm=TRUE) samples2rmv <- c("S28", "S122", "S123", "S124", "S133", "S138", "S139", "S141", "S221", "S222", "S224", "S226", "S227", "S228", "S229", "S230", "S231", "S237", "S238", "S240", "S241", "S248", "S250", "S251", "S252", "S254", "S292", "S317", "S342", "S371", "S379", "S380", "S397", "S398", "S401") BC_dat_clin <- BC_dat_clin[-which(BC_dat_clin$ID %in% samples2rmv), -col2rmv] head(BC_dat_clin) #import genomics data urlPath="http://ccb.nki.nl/data/" zipFile <- paste0("ZipFiles295Samples", ".zip") download.file(paste0(urlPath, zipFile), zipFile) unzip(zipFile, exdir="./temp_unzip") unlink(zipFile) unlink("./temp_unzip/Readme.txt", recursive=FALSE) txtfiles <- list.files("./temp_unzip/") BC_dat_exp <- NULL for(f in txtfiles){ temp_exp <- read.delim(paste0("./temp_unzip/", f)) if(f==txtfiles[1]){ gene_id <- as.character(temp_exp[-1, 1]) gene_symbol <- as.character(temp_exp[-1, 2]) } temp_exp <- temp_exp[-1, grep("Sample.", colnames(temp_exp))] colnames(temp_exp) <- gsub("Sample.", "S", colnames(temp_exp)) if(f==txtfiles[1]){ BC_dat_exp <- temp_exp }else{ BC_dat_exp <- cbind(BC_dat_exp, temp_exp) } } BC_dat_exp_all <- cbind.data.frame("SYMBOL"=gene_symbol, BC_dat_exp[, BC_dat_clin$ID]) unlink("./temp_unzip", recursive=TRUE) # translating the pathways from Entrez ID to gene symbol if (requireNamespace("org.Hs.eg.db", quietly = TRUE)){ library(org.Hs.eg.db) x <- org.Hs.egSYMBOL mapped_genes <- mappedkeys(x) xx <- as.list(x[mapped_genes]) cancer_pathways_Symbol <- lapply(cancer_pathways, function(v){unlist(xx[v])}) sapply(cancer_pathways, function(x){length(intersect(x, rownames(BC_dat_exp)))/length(x)}) } }
data("cancer_pathways") if(interactive()){ ##get the data from Vijver publication #clinical data import_xls_from_zip <- function(urlPath, filename, zipname, skip=0){ zipFile <- paste0(zipname, ".zip") download.file(paste0(urlPath, zipFile), zipFile) unzip(zipFile, exdir="./temp_unzip") xlsFile <- paste0("./temp_unzip/", filename, ".xls") res <- readxl::read_xls(xlsFile, skip=skip) unlink(zipFile) unlink("./temp_unzip", recursive=TRUE) return(res) } BC_dat_clin <- import_xls_from_zip2(urlPath="http://ccb.nki.nl/data/", filename="Table1_ClinicalData_Table", zipname="nejm_table1", skip=2 ) BC_dat_clin <- BC_dat_clin[order(BC_dat_clin$SampleID), ] col2rmv <- 1:ncol(BC_dat_clin) BC_dat_clin$ID <- paste0("S", BC_dat_clin$SampleID) rownames(BC_dat_clin) <- BC_dat_clin$ID BC_dat_clin$evdeath <- BC_dat_clin$EVENTdeath BC_dat_clin$tsurv <- BC_dat_clin$TIMEsurvival BC_dat_clin$evmeta <- BC_dat_clin$EVENTmeta BC_dat_clin$tmeta<- pmin(BC_dat_clin$TIMEsurvival, BC_dat_clin$TIMEmeta, na.rm=TRUE) samples2rmv <- c("S28", "S122", "S123", "S124", "S133", "S138", "S139", "S141", "S221", "S222", "S224", "S226", "S227", "S228", "S229", "S230", "S231", "S237", "S238", "S240", "S241", "S248", "S250", "S251", "S252", "S254", "S292", "S317", "S342", "S371", "S379", "S380", "S397", "S398", "S401") BC_dat_clin <- BC_dat_clin[-which(BC_dat_clin$ID %in% samples2rmv), -col2rmv] head(BC_dat_clin) #import genomics data urlPath="http://ccb.nki.nl/data/" zipFile <- paste0("ZipFiles295Samples", ".zip") download.file(paste0(urlPath, zipFile), zipFile) unzip(zipFile, exdir="./temp_unzip") unlink(zipFile) unlink("./temp_unzip/Readme.txt", recursive=FALSE) txtfiles <- list.files("./temp_unzip/") BC_dat_exp <- NULL for(f in txtfiles){ temp_exp <- read.delim(paste0("./temp_unzip/", f)) if(f==txtfiles[1]){ gene_id <- as.character(temp_exp[-1, 1]) gene_symbol <- as.character(temp_exp[-1, 2]) } temp_exp <- temp_exp[-1, grep("Sample.", colnames(temp_exp))] colnames(temp_exp) <- gsub("Sample.", "S", colnames(temp_exp)) if(f==txtfiles[1]){ BC_dat_exp <- temp_exp }else{ BC_dat_exp <- cbind(BC_dat_exp, temp_exp) } } BC_dat_exp_all <- cbind.data.frame("SYMBOL"=gene_symbol, BC_dat_exp[, BC_dat_clin$ID]) unlink("./temp_unzip", recursive=TRUE) # translating the pathways from Entrez ID to gene symbol if (requireNamespace("org.Hs.eg.db", quietly = TRUE)){ library(org.Hs.eg.db) x <- org.Hs.egSYMBOL mapped_genes <- mappedkeys(x) xx <- as.list(x[mapped_genes]) cancer_pathways_Symbol <- lapply(cancer_pathways, function(v){unlist(xx[v])}) sapply(cancer_pathways, function(x){length(intersect(x, rownames(BC_dat_exp)))/length(x)}) } }
This functions computes p-values frm
score tests of genetic pathway risk association in 5 different models
compute_all_tests( data, ind_gene = 7:ncol(data), num_perts = 1000, Ws = NULL, rho = NA, kernel = c("linear", "gaussian", "poly"), d = 2, pca_thres = 0.9, get_ptb_pvals = FALSE, ... )
compute_all_tests( data, ind_gene = 7:ncol(data), num_perts = 1000, Ws = NULL, rho = NA, kernel = c("linear", "gaussian", "poly"), d = 2, pca_thres = 0.9, get_ptb_pvals = FALSE, ... )
data |
a
|
ind_gene |
columns indices of genes in the pathway of interest. Default is |
num_perts |
number of perturbations used. Default is |
Ws |
optional inputed perturbations, should be a vector of length |
rho |
a vector of rhos, such as one found created from the range returned by |
kernel |
a character string indicating which kernel is used. Possible values (currently implemented) are
|
d |
if |
pca_thres |
a number between |
get_ptb_pvals |
a logical flag indicating whether perturbed p-values should be returned
as part of the results. Default is |
... |
extra parameters to be passed to a user-defined kernel. |
either a vector
of p-values for 5 different models with names:
"SCR"
: Semi-Competing Risks
"PFS"
: Progression Free Survival
"CR"
: Competing Risks
"OS"
: Overall Survival
"SCR_alt"
: SCR allowing different tuning parameters for the two event time processes
or else if get_ptb_pvals
is TRUE
, a list
with 2 elements:
"obs_pvals"
: a vector containing the observed p-values for each of the 5 models as described above
"null_pvals_perts"
: a matrix of dimensions num_perts x 5
containing the corresponding
perturbed p-values
Neykov M, Hejblum BP, Sinnot JA, Kernel Machine Score Test for Pathway Analysis in the Presence of Semi-Competing Risks, submitted, 2016.
## First generate some Data feat_m_fun <- function(X){ sin(X[,1]+X[,2]^2)-1 } feat_d_fun <- function(X){ (X[,4]-X[,5])^2/8 } mydata <- sim_SCR_data(data_size = 400, ncol_gene_mat = 20, feat_m = feat_m_fun, feat_d = feat_d_fun, mu_cen = 40, cov=0.5) #initial range ind_gene <- c(7:ncol(mydata)) my_rho_init <- seq(0.01, 20, length=300)*length(ind_gene) range(my_rho_init) if(interactive()){ # compute the interval for rho rho_set <- findRhoInterval(tZ=t(mydata[,ind_gene]), rho_init = my_rho_init, kernel="gaussian") rho_set range(my_rho_init) # good to check that the interval produced here is strictly contained in rho_init # otherwise, expand rho.init and rerun rhos <- exp(seq(log(rho_set[1]),log(rho_set[2]), length=50)) # run the tests with Gaussian kernel compute_all_tests(data = mydata, num_perts=1000, rho=rhos, kernel="gaussian") # run the tests with linear kernel compute_all_tests(data=mydata, num_perts=1000, kernel="linear") }
## First generate some Data feat_m_fun <- function(X){ sin(X[,1]+X[,2]^2)-1 } feat_d_fun <- function(X){ (X[,4]-X[,5])^2/8 } mydata <- sim_SCR_data(data_size = 400, ncol_gene_mat = 20, feat_m = feat_m_fun, feat_d = feat_d_fun, mu_cen = 40, cov=0.5) #initial range ind_gene <- c(7:ncol(mydata)) my_rho_init <- seq(0.01, 20, length=300)*length(ind_gene) range(my_rho_init) if(interactive()){ # compute the interval for rho rho_set <- findRhoInterval(tZ=t(mydata[,ind_gene]), rho_init = my_rho_init, kernel="gaussian") rho_set range(my_rho_init) # good to check that the interval produced here is strictly contained in rho_init # otherwise, expand rho.init and rerun rhos <- exp(seq(log(rho_set[1]),log(rho_set[2]), length=50)) # run the tests with Gaussian kernel compute_all_tests(data = mydata, num_perts=1000, rho=rhos, kernel="gaussian") # run the tests with linear kernel compute_all_tests(data=mydata, num_perts=1000, kernel="linear") }
Find an interval constraining the rho parameter for a non linear kernel
findRhoInterval( tZ, rho_init = seq(0.01, 20, length = 300) * nrow(tZ), kernel = c("gaussian", "poly"), d = NA, rate_range = c(1.5, 4), pca_thres = 0.9, warning_suppress = TRUE )
findRhoInterval( tZ, rho_init = seq(0.01, 20, length = 300) * nrow(tZ), kernel = c("gaussian", "poly"), d = NA, rate_range = c(1.5, 4), pca_thres = 0.9, warning_suppress = TRUE )
tZ |
a |
rho_init |
an initial large range of possible rhos, which will be considered to see if they are
reasonable tuning parameters for the kernel. Default is |
kernel |
character string specifying a nonlinear kernel. Currently supported options are:
|
d |
if |
rate_range |
a vector of length 2 indicating the range of alpha in the paper. Default is |
pca_thres |
a number between |
warning_suppress |
logical flag. Indicating whether the warnings should be suppress during
the linear model fitting step. Default is |
This function will print rho_init
range and the range of valid tuning parameters.
If that range butts up against either the upper or lower bound of rho_init
, you can rerun this function
with a bigger rho_init
.
Finding the right tuning parameters includes a step of fitting a linear model which can fail
because some tuning parameters yield only one eigenvector. We want to eliminate those tuning parameters,
so this is OK. However, in case one want to suppress (numerous) annoying warning messages, use the
warning_suppress
argument.
an upper and lower bound to look for rho
## First generate some Data feat_m_fun <- function(X){ sin(X[,1]+X[,2]^2)-1 } feat_d_fun <- function(X){ (X[,4]-X[,5])^2/8 } mydata <- sim_SCR_data(data_size = 400, ncol_gene_mat = 20, feat_m = feat_m_fun, feat_d = feat_d_fun, mu_cen = 30, cov=0.5) #initial range ind_gene <- c(7:ncol(mydata)) my_rho_init <- seq(0.01, 20, length=300)*length(ind_gene) range(my_rho_init) if(interactive()){ # compute the interval for rho rho_set <- findRhoInterval(tZ=t(mydata[,ind_gene]), rho_init = my_rho_init, kernel="gaussian") rho_set range(my_rho_init) # good to check that the interval produced here is strictly contained in rho_init # otherwise, expand rho.init and rerun #rhos <- exp(seq(log(rho_set[1]),log(rho_set[2]), length=50)) }
## First generate some Data feat_m_fun <- function(X){ sin(X[,1]+X[,2]^2)-1 } feat_d_fun <- function(X){ (X[,4]-X[,5])^2/8 } mydata <- sim_SCR_data(data_size = 400, ncol_gene_mat = 20, feat_m = feat_m_fun, feat_d = feat_d_fun, mu_cen = 30, cov=0.5) #initial range ind_gene <- c(7:ncol(mydata)) my_rho_init <- seq(0.01, 20, length=300)*length(ind_gene) range(my_rho_init) if(interactive()){ # compute the interval for rho rho_set <- findRhoInterval(tZ=t(mydata[,ind_gene]), rho_init = my_rho_init, kernel="gaussian") rho_set range(my_rho_init) # good to check that the interval produced here is strictly contained in rho_init # otherwise, expand rho.init and rerun #rhos <- exp(seq(log(rho_set[1]),log(rho_set[2]), length=50)) }
Data Simulation Function
sim_SCR_data( data_size, ncol_gene_mat, feat_m, feat_d, mu_cen, cov, lam_m = 1/15, lam_d = 1/20, norm_vcov = c(1, 0.5, 0.5, 1) )
sim_SCR_data( data_size, ncol_gene_mat, feat_m, feat_d, mu_cen, cov, lam_m = 1/15, lam_d = 1/20, norm_vcov = c(1, 0.5, 0.5, 1) )
data_size |
an integer giving the simulated sample size |
ncol_gene_mat |
an integer giving the simulated number of genomic covariates |
feat_m |
a function that transforms the genomic features into the signal for the metastasis
process. This function should a matrix of dimensions |
feat_d |
a function that transforms the genomic features into the signal for the death
process. This function should a matrix of dimensions |
mu_cen |
mean of the exponential censoring process |
cov |
the correlation between the genomic covariates |
lam_m |
baseline hazard constant for metastasis process. Default is |
lam_d |
baseline hazard constant for death process. Default is |
norm_vcov |
vector of length 4 of correlation between errors between the two processes on
the normal scale before being complementary-log-log-transformed. Default is |
a data.frame
with columns:
XR
: time to recurrence / death / censoring
XD
: time to death / censoring
DeltaR
: Indicator of censoring (0), recurrence (1), or death (2) for this earliest time XR
DeltaD
: Indicator of censoring (0) or death (1)
XPFS
: time to recurrence / death / censoring (=XR
)
DeltaPFS
: Indicator of censoring (0) or recurrence or death, whichever came first (1)
Z_1,...,Z_P
: genomic variables
feat_m_fun <- function(X){ sin(X[,1]+X[,2]^2)-1 } feat_d_fun <- function(X){ (X[,4]-X[,5])^2/8 } mydata <- sim_SCR_data(data_size = 400, ncol_gene_mat = 20, feat_m = feat_m_fun, feat_d = feat_d_fun, mu_cen = 30, cov=0.5) head(mydata) ## how many experience both events mean(mydata[,"DeltaR"]==1 & mydata[,"DeltaD"]==1) ## how many only recur mean(mydata[,"DeltaR"]==1 & mydata[,"DeltaD"]==0) ## how many only die mean(mydata[,"DeltaR"]==2 & mydata[,"DeltaD"]==1) ## how many are censored mean(mydata[,"DeltaR"]==0 & mydata[,"DeltaD"]==0)
feat_m_fun <- function(X){ sin(X[,1]+X[,2]^2)-1 } feat_d_fun <- function(X){ (X[,4]-X[,5])^2/8 } mydata <- sim_SCR_data(data_size = 400, ncol_gene_mat = 20, feat_m = feat_m_fun, feat_d = feat_d_fun, mu_cen = 30, cov=0.5) head(mydata) ## how many experience both events mean(mydata[,"DeltaR"]==1 & mydata[,"DeltaD"]==1) ## how many only recur mean(mydata[,"DeltaR"]==1 & mydata[,"DeltaD"]==0) ## how many only die mean(mydata[,"DeltaR"]==2 & mydata[,"DeltaD"]==1) ## how many are censored mean(mydata[,"DeltaR"]==0 & mydata[,"DeltaD"]==0)