write_predict_db_checks

Author

Saideep Gona

Published

February 26, 2024

Context

We have been having some difficulties running Predict-DB, so here I am going to debug my latest issues and make a more formal set of pre-checks everyone can use.

Functions

Code
get_filtered_snp_annot <- function(snp_annot_file_name) {
  snp_annot <- read.table(snp_annot_file_name, header = T, stringsAsFactors = F) 
  
  if("refAllele" %in% names(snp_annot)) {snp_annot <- snp_annot %>% rename(ref_vcf = refAllele)}
  if("effectAllele" %in% names(snp_annot)) {snp_annot <- snp_annot %>% rename(alt_vcf = effectAllele)}

  snp_annot <- snp_annot %>%
    filter(!((ref_vcf == 'A' & alt_vcf == 'T') |
               (ref_vcf == 'T' & alt_vcf == 'A') |
               (ref_vcf == 'C' & alt_vcf == 'G') |
               (ref_vcf == 'G' & alt_vcf == 'C')) &
             !(is.na(rsid))) %>%
    distinct(varID, .keep_all = TRUE)
  snp_annot
}


get_maf_filtered_genotype <- function(genotype_file_name,  maf, samples) {
  gt_df <- read.table(genotype_file_name, header = T, stringsAsFactors = F, row.names = 1)
  gt_df <- gt_df[,(colnames(gt_df) %in% samples )] %>% t() %>% as.data.frame()
  effect_allele_freqs <- colMeans(gt_df) / 2
  gt_df <- gt_df[,which((effect_allele_freqs >= maf) & (effect_allele_freqs <= 1 - maf))]
  gt_df
}

get_gene_annotation <- function(gene_annot_file_name, chrom, gene_types=c('protein_coding', 'pseudogene', 'lincRNA')){
  gene_df <- read.table(gene_annot_file_name, header = TRUE, stringsAsFactors = FALSE) %>%
    filter((chr == chrom) & gene_type %in% gene_types) %>% 
    mutate(gene_id = file_path_sans_ext(gene_id))  # remove gene ver from gene names
  gene_df
}

get_gene_type <- function(gene_annot, gene) {
  gene_annot <- gene_annot %>% mutate(gene_id = file_path_sans_ext(gene_id))
  filter(gene_annot, gene_id == gene)$gene_type
}

get_gene_expression <- function(gene_expression_file_name, gene_annot) {
  expr_df <- as.data.frame(t(read.table(gene_expression_file_name, header = T, stringsAsFactors = F, row.names = 1)))
  expr_df <- expr_df %>% t() %>% as.data.frame()
  colnames(expr_df) <- file_path_sans_ext(colnames(expr_df)) # remove gene version number
  expr_df <- expr_df %>% select(one_of(intersect(gene_annot$gene_id, colnames(expr_df))))
  expr_df
}

get_gene_coords <- function(gene_annot, gene) {
  row <- gene_annot[which(gene_annot$gene_id == gene),]
  c(row$start, row$end)
}

get_cis_genotype <- function(gt_df, snp_annot, coords, cis_window) {
  snp_info <- snp_annot %>% filter((pos >= (coords[1] - cis_window) & !is.na(rsid)) & (pos <= (coords[2] + cis_window)))
  if (nrow(snp_info) == 0)
    return(NA)
  #Check of the varID exist in the data
  if (TRUE %in% (snp_info$varID %in% names(gt_df))) {
    cis_gt <- gt_df %>% select(one_of(intersect(snp_info$varID, colnames(gt_df))))
  } else {
    return(NA) # the varID doesn't exist in the gt_df dataset
  }
  column_labels <- colnames(cis_gt)
  row_labels <- rownames(cis_gt)
  # Convert cis_gt to a matrix for glmnet
  cis_gt <- matrix(as.matrix(cis_gt), ncol=ncol(cis_gt)) # R is such a bad language.
  colnames(cis_gt) <- column_labels
  rownames(cis_gt) <- row_labels
  cis_gt
}

get_covariates <- function(covariate_file_name, samples) {
  cov_df <- read.table(covariate_file_name, header = TRUE, stringsAsFactors = FALSE, row.names = 1)
  cov_df <- cov_df[,(names(cov_df) %in% samples )] %>% t() %>% as.data.frame()
  cov_df
}

adjust_for_covariates <- function(expression_vec, cov_df) {
  combined_df <- cbind(expression_vec, cov_df)
  expr_resid <- summary(lm(expression_vec ~ ., data=combined_df))$residuals
  expr_resid <- scale(expr_resid, center = TRUE, scale = TRUE)
  expr_resid
}

do_elastic_net <- function(cis_gt, expr_adj, n_folds, cv_fold_ids, n_times, alpha) {
  #tryCatch({
    cis_gt <- as.matrix(cis_gt)
    fit <- cv.glmnet(cis_gt, expr_adj, nfolds = n_folds, alpha = alpha, keep = TRUE, type.measure='mse', foldid = cv_fold_ids[,1], parallel = FALSE)
    lambda_seq <- fit$lambda
    cvms <- matrix(nrow=length(lambda_seq), ncol=n_times)
    fits <- list()
    fits[[1]] <- fit
    cvms <- matrix(nrow = 100, ncol = n_times)
    cvms[1:length(fit$cvm),1] <- fit$cvm
    for (i in 2:(n_times)) {
      fit <- cv.glmnet(cis_gt, expr_adj, lambda = lambda_seq, nfolds = n_folds, alpha = alpha, keep = FALSE, foldid = cv_fold_ids[,i], parallel = FALSE)
      fits[[i]] <- fit
      cvms[1:length(fit$cvm),i] <- fit$cvm
    }
    avg_cvm <- rowMeans(cvms)
    best_lam_ind <- which.min(avg_cvm)
    best_lambda <- lambda_seq[best_lam_ind]
    out <- list(cv_fit = fits[[1]], min_avg_cvm = min(avg_cvm, na.rm = T), best_lam_ind = best_lam_ind, best_lambda = best_lambda)
  #  out
  #},error = function(cond) {
  #  message('Error')
  #  message(geterrmessage())
  #  out <- list()
  #  out
  #}
  #)
  out
}

evaluate_performance <- function(cis_gt, expr_adj, fit, best_lam_ind, best_lambda, cv_fold_ids, n_folds) {
  n_nonzero <- fit$nzero[best_lam_ind]
  print(glue("Num-nonzero: {n_nonzero}"))
  if (n_nonzero > 0) {
    R2 <- rep(0, n_folds)
    corr_folds <- rep(0, n_folds)
    pval_folds <- rep(0, n_folds)
    zscore_folds <- rep(0, n_folds)

    for (j in (1:n_folds)) {
      fold_idxs <- which(cv_fold_ids[,1] == j)
      fitted_y <- fit$fit.preval[, best_lam_ind]

      tss <- sum(expr_adj[fold_idxs]**2)
      rss <- sum((expr_adj[fold_idxs] - fitted_y[fold_idxs])**2)
      R2[j] <- 1 - (rss/tss)

      corr_folds[j] <- ifelse(sd(fitted_y[fold_idxs]) != 0,
                       cor(expr_adj[fold_idxs], fitted_y[fold_idxs]), 0)
      pval_folds[j] <- ifelse(sd(fitted_y[fold_idxs]) != 0 & length(expr_adj[fold_idxs]) > 3,
                        cor.test(expr_adj[fold_idxs],fitted_y[fold_idxs])$p.value, 1)

      zscore_folds[j] <- atanh(corr_folds[j]) * sqrt(length(expr_adj[fold_idxs]) - 3) # Fisher transformation
    }

    best_fit <- fit$glmnet.fit
    expr_adj_pred <- predict(best_fit, as.matrix(cis_gt), s = best_lambda)
    tss <- sum(expr_adj**2)
    rss <- sum((expr_adj - expr_adj_pred)**2)
    
    n_samp <- length(expr_adj)
    #f_stat <- ((tss - rss) / n_nonzero) / (rss / (n_samp - ncol(cis_gt) - 1))
    #p_val <- pf(f_stat, n_samp - 1, n_samp - ncol(cis_gt) - 1, lower.tail = FALSE)
    weights <- best_fit$beta[which(best_fit$beta[,best_lam_ind] != 0), best_lam_ind]
    weighted_snps <- names(best_fit$beta[,best_lam_ind])[which(best_fit$beta[,best_lam_ind] != 0)]
    R2_mean <- mean(R2)
    R2_sd <- sd(R2)
    inR2 <- 1 - (rss/tss)
    rho_avg <- mean(corr_folds)
    rho_se <- sd(corr_folds)
    rho_avg_squared <- rho_avg**2

    # Stouffer's method for combining z scores.
    zscore_est <- sum(zscore_folds) / sqrt(n_folds)
    zscore_pval <- 2*pnorm(abs(zscore_est), lower.tail = FALSE)
    # Fisher's method for combining p-values: https://en.wikipedia.org/wiki/Fisher%27s_method
    pval_est <- pchisq(-2 * sum(log(pval_folds)), 2*n_folds, lower.tail = F)
    # Old way
    pred_perf <- summary(lm(expr_adj ~ fit$fit.preval[,best_lam_ind]))
    pred_perf_rsq <- pred_perf$r.squared
    
    #one_sided_pval <- cor.test(expr_adj, fit$fit.preval[,best_lam_ind], alternative = 'greater')$p.value
    out <- list(weights = weights, n_weights = n_nonzero, weighted_snps = weighted_snps, R2_mean = R2_mean, R2_sd = R2_sd,
                inR2 = inR2, pval_est=pval_est, rho_avg=rho_avg, rho_se=rho_se, 
                rho_zscore=zscore_est, rho_avg_squared=rho_avg_squared, zscore_pval=zscore_pval)
  } else {
    out <- list(weights = NA, n_weights = n_nonzero, weighted_snps = NA, R2_mean = NA, R2_sd = NA,
                inR2 = NA, pval_est = NA, rho_avg = NA, rho_se= NA, 
                rho_zscore= NA, rho_avg_squared= NA, zscore_pval= NA)
  }
  out
}

do_covariance <- function(gene_id, cis_gt, rsids, varIDs, out_file) {
  model_gt <- cis_gt[,varIDs, drop=FALSE]
  geno_cov <- cov(model_gt)
  #print(dim(geno_cov))
  cov_df <- data.frame(gene=character(),rsid1=character(),rsid2=character(), covariance=double())
  for (i in 1:length(rsids)) {
    for (j in i:length(rsids)) {
      #print(c(i, j))
      cov_df <- tryCatch(rbind(cov_df, data.frame(gene=gene_id,rsid1=rsids[i], rsid2=rsids[j], covariance=geno_cov[i,j])),
                         error = function(cond) browser())
    }
  }
  write.table(cov_df, file = out_file, append = TRUE, quote = FALSE, col.names = FALSE, row.names = FALSE, sep = " ")
}

Set paths

Code
# suppressMessages(library(glue))

# chrom <- 1
# snp_annot_file <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/snp_file")
# gene_annot_file <- "/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/gene_annot.parsed.txt"
# genotype_file <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/genotype_file")
# expression_file <- "/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/transposed_gene_exp.txt"
# covariates_file <- NULL
# n_k_folds <- 10


# suppressMessages(library(dplyr))
# suppressMessages(library(glmnet))
# suppressMessages((library(reshape2)))
# suppressMessages(library(methods))
# suppressMessages(library(tools))
# "%&%" <- function(a,b) paste(a,b, sep = "")

Test MAIN

read gene get_gene_annotation

Code
# gene_annot <- get_gene_annotation(gene_annot_file, chrom)
# expr_df <- get_gene_expression(expression_file, gene_annot)
# samples <- rownames(expr_df)
# n_samples <- length(samples)
# genes <- colnames(expr_df)
# n_genes <- length(expr_df)
# snp_annot <- get_filtered_snp_annot(snp_annot_file)

# maf=0.01
# gt_df <- get_maf_filtered_genotype(genotype_file, maf, samples)

# print(dim(gene_annot))
# print(dim(expr_df))
# print(dim(gt_df))
# print(dim(snp_annot))

# print(intersect(rownames(gt_df), rownames(expr_df)))
# print(length(intersect(rownames(gt_df), rownames(expr_df))))
# prefix <- "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2024-02-26-write_predict_db_checks"

# seed <- 1
# n_times <- 3

# cis_window <- 1e6
# alpha <- 0.5

Set seed and cv fold_idxs

Code
# # Set seed and cross-validation fold ids----
# seed <- ifelse(is.na(seed), sample(1:1000000, 1), seed)
# set.seed(seed)
# cv_fold_ids <- matrix(nrow = n_samples, ncol = n_times)
# for (j in 1:n_times)
#   cv_fold_ids[,j] <- sample(1:n_k_folds, n_samples, replace = TRUE)

Start training models

Code
# # Prepare output data----

# for (i in 1:5) {
#     cat(i, "/", n_genes, "\n")
#     gene <- genes[i]
#     gene_name <- gene_annot$gene_name[gene_annot$gene_id == gene]
#     gene_type <- get_gene_type(gene_annot, gene)
#     coords <- get_gene_coords(gene_annot, gene)
#     cis_gt <- get_cis_genotype(gt_df, snp_annot, coords, cis_window)


#     if (ncol(cis_gt) >= 2) {
#       expression_vec <- expr_df[,i]
#       if (is.null(NULL)) {
#         adj_expression <- as.matrix(expression_vec)
#         rownames(adj_expression) <- rownames(expr_df)
#       } else {
#         adj_expression <- adjust_for_covariates(expression_vec, covariates_df)
#       }
#       # Filter out samples that are in both expression and genotype data.
#       adj_expression <- as.matrix(adj_expression[(rownames(adj_expression) %in% rownames(cis_gt)),])
#       # sort row names to be in the same order for X and y
#       cis_gt = cis_gt[match(rownames(adj_expression), rownames(cis_gt)),]

#       elnet_out <- do_elastic_net(cis_gt, adj_expression, n_k_folds, cv_fold_ids, n_times, alpha)
#       if (length(elnet_out) > 0) {
#         eval <- evaluate_performance(cis_gt, adj_expression, elnet_out$cv_fit, 
#         elnet_out$best_lam_ind, elnet_out$best_lambda, cv_fold_ids, n_k_folds)
#         model_summary <- c(gene, gene_name, gene_type, alpha, ncol(cis_gt),           elnet_out$min_avg_cvm, elnet_out$best_lam_ind,
#                            elnet_out$best_lambda, eval$n_weights, eval$R2_mean, eval$R2_sd, eval$inR2,
#                            eval$pval_est, eval$rho_avg, eval$rho_se, eval$rho_zscore, eval$rho_avg_squared, eval$zscore_pval)
#         print(model_summary)
#         print(eval)
#         print(length(model_summary))
#     }
#   }
# }

Prechecks

Prechecks should be:

1.) Gene overlap between expression data and gene annotations should be high relative to the number of genes in the expression data.

2.) Sample overlap between expression data and genotype data should be high relative to the number of samples in the expression data.

3.) SNP overlap between genotype data and SNP annotations should be high relative to the number of SNPs in the genotype data.

4.) Formatting of each file should match documentation

Code
# summ <- read.table("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/summary/Model_training_chr1_model_summaries.txt")
Code
# summ