---
title: "write_predict_db_checks"
author: "Saideep Gona"
date: "2024-02-26"
format:
html:
code-fold: true
code-summary: "Show the code"
execute:
freeze: true
warning: false
---
# Context
We have been having some difficulties running Predict-DB, so here I am going to debug my latest issues and make a more formal set of pre-checks everyone can use.
### Functions
```{r}
get_filtered_snp_annot <- function (snp_annot_file_name) {
snp_annot <- read.table (snp_annot_file_name, header = T, stringsAsFactors = F)
if ("refAllele" %in% names (snp_annot)) {snp_annot <- snp_annot %>% rename (ref_vcf = refAllele)}
if ("effectAllele" %in% names (snp_annot)) {snp_annot <- snp_annot %>% rename (alt_vcf = effectAllele)}
snp_annot <- snp_annot %>%
filter (! ((ref_vcf == 'A' & alt_vcf == 'T' ) |
(ref_vcf == 'T' & alt_vcf == 'A' ) |
(ref_vcf == 'C' & alt_vcf == 'G' ) |
(ref_vcf == 'G' & alt_vcf == 'C' )) &
! (is.na (rsid))) %>%
distinct (varID, .keep_all = TRUE )
snp_annot
}
get_maf_filtered_genotype <- function (genotype_file_name, maf, samples) {
gt_df <- read.table (genotype_file_name, header = T, stringsAsFactors = F, row.names = 1 )
gt_df <- gt_df[,(colnames (gt_df) %in% samples )] %>% t () %>% as.data.frame ()
effect_allele_freqs <- colMeans (gt_df) / 2
gt_df <- gt_df[,which ((effect_allele_freqs >= maf) & (effect_allele_freqs <= 1 - maf))]
gt_df
}
get_gene_annotation <- function (gene_annot_file_name, chrom, gene_types= c ('protein_coding' , 'pseudogene' , 'lincRNA' )){
gene_df <- read.table (gene_annot_file_name, header = TRUE , stringsAsFactors = FALSE ) %>%
filter ((chr == chrom) & gene_type %in% gene_types) %>%
mutate (gene_id = file_path_sans_ext (gene_id)) # remove gene ver from gene names
gene_df
}
get_gene_type <- function (gene_annot, gene) {
gene_annot <- gene_annot %>% mutate (gene_id = file_path_sans_ext (gene_id))
filter (gene_annot, gene_id == gene)$ gene_type
}
get_gene_expression <- function (gene_expression_file_name, gene_annot) {
expr_df <- as.data.frame (t (read.table (gene_expression_file_name, header = T, stringsAsFactors = F, row.names = 1 )))
expr_df <- expr_df %>% t () %>% as.data.frame ()
colnames (expr_df) <- file_path_sans_ext (colnames (expr_df)) # remove gene version number
expr_df <- expr_df %>% select (one_of (intersect (gene_annot$ gene_id, colnames (expr_df))))
expr_df
}
get_gene_coords <- function (gene_annot, gene) {
row <- gene_annot[which (gene_annot$ gene_id == gene),]
c (row$ start, row$ end)
}
get_cis_genotype <- function (gt_df, snp_annot, coords, cis_window) {
snp_info <- snp_annot %>% filter ((pos >= (coords[1 ] - cis_window) & ! is.na (rsid)) & (pos <= (coords[2 ] + cis_window)))
if (nrow (snp_info) == 0 )
return (NA )
#Check of the varID exist in the data
if (TRUE %in% (snp_info$ varID %in% names (gt_df))) {
cis_gt <- gt_df %>% select (one_of (intersect (snp_info$ varID, colnames (gt_df))))
} else {
return (NA ) # the varID doesn't exist in the gt_df dataset
}
column_labels <- colnames (cis_gt)
row_labels <- rownames (cis_gt)
# Convert cis_gt to a matrix for glmnet
cis_gt <- matrix (as.matrix (cis_gt), ncol= ncol (cis_gt)) # R is such a bad language.
colnames (cis_gt) <- column_labels
rownames (cis_gt) <- row_labels
cis_gt
}
get_covariates <- function (covariate_file_name, samples) {
cov_df <- read.table (covariate_file_name, header = TRUE , stringsAsFactors = FALSE , row.names = 1 )
cov_df <- cov_df[,(names (cov_df) %in% samples )] %>% t () %>% as.data.frame ()
cov_df
}
adjust_for_covariates <- function (expression_vec, cov_df) {
combined_df <- cbind (expression_vec, cov_df)
expr_resid <- summary (lm (expression_vec ~ ., data= combined_df))$ residuals
expr_resid <- scale (expr_resid, center = TRUE , scale = TRUE )
expr_resid
}
do_elastic_net <- function (cis_gt, expr_adj, n_folds, cv_fold_ids, n_times, alpha) {
#tryCatch({
cis_gt <- as.matrix (cis_gt)
fit <- cv.glmnet (cis_gt, expr_adj, nfolds = n_folds, alpha = alpha, keep = TRUE , type.measure= 'mse' , foldid = cv_fold_ids[,1 ], parallel = FALSE )
lambda_seq <- fit$ lambda
cvms <- matrix (nrow= length (lambda_seq), ncol= n_times)
fits <- list ()
fits[[1 ]] <- fit
cvms <- matrix (nrow = 100 , ncol = n_times)
cvms[1 : length (fit$ cvm),1 ] <- fit$ cvm
for (i in 2 : (n_times)) {
fit <- cv.glmnet (cis_gt, expr_adj, lambda = lambda_seq, nfolds = n_folds, alpha = alpha, keep = FALSE , foldid = cv_fold_ids[,i], parallel = FALSE )
fits[[i]] <- fit
cvms[1 : length (fit$ cvm),i] <- fit$ cvm
}
avg_cvm <- rowMeans (cvms)
best_lam_ind <- which.min (avg_cvm)
best_lambda <- lambda_seq[best_lam_ind]
out <- list (cv_fit = fits[[1 ]], min_avg_cvm = min (avg_cvm, na.rm = T), best_lam_ind = best_lam_ind, best_lambda = best_lambda)
# out
#},error = function(cond) {
# message('Error')
# message(geterrmessage())
# out <- list()
# out
#}
#)
out
}
evaluate_performance <- function (cis_gt, expr_adj, fit, best_lam_ind, best_lambda, cv_fold_ids, n_folds) {
n_nonzero <- fit$ nzero[best_lam_ind]
print (glue ("Num-nonzero: {n_nonzero}" ))
if (n_nonzero > 0 ) {
R2 <- rep (0 , n_folds)
corr_folds <- rep (0 , n_folds)
pval_folds <- rep (0 , n_folds)
zscore_folds <- rep (0 , n_folds)
for (j in (1 : n_folds)) {
fold_idxs <- which (cv_fold_ids[,1 ] == j)
fitted_y <- fit$ fit.preval[, best_lam_ind]
tss <- sum (expr_adj[fold_idxs]** 2 )
rss <- sum ((expr_adj[fold_idxs] - fitted_y[fold_idxs])** 2 )
R2[j] <- 1 - (rss/ tss)
corr_folds[j] <- ifelse (sd (fitted_y[fold_idxs]) != 0 ,
cor (expr_adj[fold_idxs], fitted_y[fold_idxs]), 0 )
pval_folds[j] <- ifelse (sd (fitted_y[fold_idxs]) != 0 & length (expr_adj[fold_idxs]) > 3 ,
cor.test (expr_adj[fold_idxs],fitted_y[fold_idxs])$ p.value, 1 )
zscore_folds[j] <- atanh (corr_folds[j]) * sqrt (length (expr_adj[fold_idxs]) - 3 ) # Fisher transformation
}
best_fit <- fit$ glmnet.fit
expr_adj_pred <- predict (best_fit, as.matrix (cis_gt), s = best_lambda)
tss <- sum (expr_adj** 2 )
rss <- sum ((expr_adj - expr_adj_pred)** 2 )
n_samp <- length (expr_adj)
#f_stat <- ((tss - rss) / n_nonzero) / (rss / (n_samp - ncol(cis_gt) - 1))
#p_val <- pf(f_stat, n_samp - 1, n_samp - ncol(cis_gt) - 1, lower.tail = FALSE)
weights <- best_fit$ beta[which (best_fit$ beta[,best_lam_ind] != 0 ), best_lam_ind]
weighted_snps <- names (best_fit$ beta[,best_lam_ind])[which (best_fit$ beta[,best_lam_ind] != 0 )]
R2_mean <- mean (R2)
R2_sd <- sd (R2)
inR2 <- 1 - (rss/ tss)
rho_avg <- mean (corr_folds)
rho_se <- sd (corr_folds)
rho_avg_squared <- rho_avg** 2
# Stouffer's method for combining z scores.
zscore_est <- sum (zscore_folds) / sqrt (n_folds)
zscore_pval <- 2 * pnorm (abs (zscore_est), lower.tail = FALSE )
# Fisher's method for combining p-values: https://en.wikipedia.org/wiki/Fisher%27s_method
pval_est <- pchisq (- 2 * sum (log (pval_folds)), 2 * n_folds, lower.tail = F)
# Old way
pred_perf <- summary (lm (expr_adj ~ fit$ fit.preval[,best_lam_ind]))
pred_perf_rsq <- pred_perf$ r.squared
#one_sided_pval <- cor.test(expr_adj, fit$fit.preval[,best_lam_ind], alternative = 'greater')$p.value
out <- list (weights = weights, n_weights = n_nonzero, weighted_snps = weighted_snps, R2_mean = R2_mean, R2_sd = R2_sd,
inR2 = inR2, pval_est= pval_est, rho_avg= rho_avg, rho_se= rho_se,
rho_zscore= zscore_est, rho_avg_squared= rho_avg_squared, zscore_pval= zscore_pval)
} else {
out <- list (weights = NA , n_weights = n_nonzero, weighted_snps = NA , R2_mean = NA , R2_sd = NA ,
inR2 = NA , pval_est = NA , rho_avg = NA , rho_se= NA ,
rho_zscore= NA , rho_avg_squared= NA , zscore_pval= NA )
}
out
}
do_covariance <- function (gene_id, cis_gt, rsids, varIDs, out_file) {
model_gt <- cis_gt[,varIDs, drop= FALSE ]
geno_cov <- cov (model_gt)
#print(dim(geno_cov))
cov_df <- data.frame (gene= character (),rsid1= character (),rsid2= character (), covariance= double ())
for (i in 1 : length (rsids)) {
for (j in i: length (rsids)) {
#print(c(i, j))
cov_df <- tryCatch (rbind (cov_df, data.frame (gene= gene_id,rsid1= rsids[i], rsid2= rsids[j], covariance= geno_cov[i,j])),
error = function (cond) browser ())
}
}
write.table (cov_df, file = out_file, append = TRUE , quote = FALSE , col.names = FALSE , row.names = FALSE , sep = " " )
}
```
### Set paths
```{r}
# suppressMessages(library(glue))
# chrom <- 1
# snp_annot_file <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/snp_file")
# gene_annot_file <- "/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/gene_annot.parsed.txt"
# genotype_file <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/genotype_file")
# expression_file <- "/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/transposed_gene_exp.txt"
# covariates_file <- NULL
# n_k_folds <- 10
# suppressMessages(library(dplyr))
# suppressMessages(library(glmnet))
# suppressMessages((library(reshape2)))
# suppressMessages(library(methods))
# suppressMessages(library(tools))
# "%&%" <- function(a,b) paste(a,b, sep = "")
```
## Test MAIN
### read gene get_gene_annotation
```{r}
# gene_annot <- get_gene_annotation(gene_annot_file, chrom)
# expr_df <- get_gene_expression(expression_file, gene_annot)
# samples <- rownames(expr_df)
# n_samples <- length(samples)
# genes <- colnames(expr_df)
# n_genes <- length(expr_df)
# snp_annot <- get_filtered_snp_annot(snp_annot_file)
# maf=0.01
# gt_df <- get_maf_filtered_genotype(genotype_file, maf, samples)
# print(dim(gene_annot))
# print(dim(expr_df))
# print(dim(gt_df))
# print(dim(snp_annot))
# print(intersect(rownames(gt_df), rownames(expr_df)))
# print(length(intersect(rownames(gt_df), rownames(expr_df))))
# prefix <- "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2024-02-26-write_predict_db_checks"
# seed <- 1
# n_times <- 3
# cis_window <- 1e6
# alpha <- 0.5
```
### Set seed and cv fold_idxs
```{r}
# # Set seed and cross-validation fold ids----
# seed <- ifelse(is.na(seed), sample(1:1000000, 1), seed)
# set.seed(seed)
# cv_fold_ids <- matrix(nrow = n_samples, ncol = n_times)
# for (j in 1:n_times)
# cv_fold_ids[,j] <- sample(1:n_k_folds, n_samples, replace = TRUE)
```
### Start training models
```{r}
# # Prepare output data----
# for (i in 1:5) {
# cat(i, "/", n_genes, "\n")
# gene <- genes[i]
# gene_name <- gene_annot$gene_name[gene_annot$gene_id == gene]
# gene_type <- get_gene_type(gene_annot, gene)
# coords <- get_gene_coords(gene_annot, gene)
# cis_gt <- get_cis_genotype(gt_df, snp_annot, coords, cis_window)
# if (ncol(cis_gt) >= 2) {
# expression_vec <- expr_df[,i]
# if (is.null(NULL)) {
# adj_expression <- as.matrix(expression_vec)
# rownames(adj_expression) <- rownames(expr_df)
# } else {
# adj_expression <- adjust_for_covariates(expression_vec, covariates_df)
# }
# # Filter out samples that are in both expression and genotype data.
# adj_expression <- as.matrix(adj_expression[(rownames(adj_expression) %in% rownames(cis_gt)),])
# # sort row names to be in the same order for X and y
# cis_gt = cis_gt[match(rownames(adj_expression), rownames(cis_gt)),]
# elnet_out <- do_elastic_net(cis_gt, adj_expression, n_k_folds, cv_fold_ids, n_times, alpha)
# if (length(elnet_out) > 0) {
# eval <- evaluate_performance(cis_gt, adj_expression, elnet_out$cv_fit,
# elnet_out$best_lam_ind, elnet_out$best_lambda, cv_fold_ids, n_k_folds)
# model_summary <- c(gene, gene_name, gene_type, alpha, ncol(cis_gt), elnet_out$min_avg_cvm, elnet_out$best_lam_ind,
# elnet_out$best_lambda, eval$n_weights, eval$R2_mean, eval$R2_sd, eval$inR2,
# eval$pval_est, eval$rho_avg, eval$rho_se, eval$rho_zscore, eval$rho_avg_squared, eval$zscore_pval)
# print(model_summary)
# print(eval)
# print(length(model_summary))
# }
# }
# }
```
## Prechecks
Prechecks should be:
1.) Gene overlap between expression data and gene annotations should be high relative to the number of genes in the expression data.
2.) Sample overlap between expression data and genotype data should be high relative to the number of samples in the expression data.
3.) SNP overlap between genotype data and SNP annotations should be high relative to the number of SNPs in the genotype data.
4.) Formatting of each file should match documentation
```{r}
# summ <- read.table("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/standard_predictDB_aracena_Flu_singlenested_3_7_lnormTRUE_beforeTRUE_scale1/work/f4/d22672a23ca1dffefd3d786b20333b/summary/Model_training_chr1_model_summaries.txt")
```
```{r}
# summ
```