---
title: "0.1_normalize_expression_GTEx"
author: "Saideep Gona"
date: "2023-11-27"
format:
  html:
    code-fold: true
    code-summary: "Show the code"
execute:
  freeze: true
  warning: false
---
# Context
It is important when running S-PrediXcan-type analyses to properly normalize the expression data. So far I have used simple log normalization, but it makes more sense downstream to normalize the way GTEx does: PrediXcan itself was developed on GTEx data, and the GTEx methods are consistent with the state of the art.

According to GTEx (https://www.nature.com/articles/nature24277#Sec11), the normalization steps are as follows:
1. TPM expression
2. TMM normalization using edgeR
3. Inverse normal transformation (see the sketch after the covariate list below)
In addition, covariates were calculated as follows:
1. Top 5 genotyping PCs
2. 15 PEER factors for sample sizes under 150
3. Other covariates
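
The inverse normal transform is worth sketching here since it comes up again below. This is a minimal rank-based version, not the exact GTEx implementation; GTEx applies it per gene across samples:

```{r}
# Minimal sketch of a rank-based inverse normal transform (INT); the 0.5
# offset is one common rank-to-quantile convention
inverse_normal_transform <- function(x) {
  qnorm((rank(x, na.last = "keep") - 0.5) / sum(!is.na(x)))
}
# Applied row-wise to a genes x samples matrix `expr`:
# int_expr <- t(apply(expr, 1, inverse_normal_transform))
```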
This offers a general guideline for performing the normalization up to standard. However, in the Aracena paper, empirical observation of the data led to somewhat different choices; the authors also performed a more thorough analysis of the covariates and used slightly different normalization techniques. They introduced the following additional normalization step:

1. Voom normalization for mean-variance correction of counts

In terms of covariate correction:

1. Just 1 genotyping PC (based on the observed variance explained across individuals)
2. 4 expression PCs
3. Batch correction using ComBat
4. Regress out age, admixture, etc.

They did not normalize by exon length because it was not necessary for their analysis, and they did not inverse quantile normalize. For the purpose of training the DL model, however, these normalizations are important. Also, PEER factors are clunky and support for them has declined. Therefore, my final normalization scheme is:
1. Length normalization
2. TMM normalization and TPM calculation using edgeR
3. Batch correction using ComBat
4. Voom normalization for mean-variance correction of counts (includes logCPM)

and covariate correction:

1. Just 1 genotyping PC (based on the observed variance explained across individuals)
2. 4 expression PCs
3. Regress out age, admixture, etc.
It turns out the above approach is flawed: length normalization does not appear to mix well with TMM normalization. Instead, it seems reasonable to use transcript counts estimated with Kallisto directly and see if that works.
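
As an aside, a common way to aggregate kallisto transcript abundances to gene level is tximport. The sketch below uses hypothetical paths and a `tx2gene` mapping that are not part of this post; the kallisto_counts.tsv loaded further down was prepared separately:

```{r}
#| eval: false
# Sketch with hypothetical inputs: `samples` is a vector of sample IDs and
# `tx2gene` a transcript-to-gene mapping table; neither is defined here
library(tximport)
files <- file.path("kallisto_out", samples, "abundance.h5")
txi <- tximport(files, type = "kallisto", tx2gene = tx2gene,
                countsFromAbundance = "lengthScaledTPM")
# txi$counts is a genes x samples matrix suitable for DGEList()
```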
```{r}
# install.packages(c("glue","tidyverse","reshape2","dplyr","stringr","statmod"))
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install(c("SNPRelate","limma","edgeR","sva","preprocessCore"))
```
```{r}
## Load libraries
library(edgeR)
library(glue)
library(tidyverse)
library(preprocessCore)
library(limma)
library(statmod)
library(sva)
library(reshape2)
library(dplyr)
library(stringr)
library(SNPRelate)
```
### Length normalization: computing gene exon lengths
We need the genes to be length normalized as well; otherwise we would be predicting arbitrary units of expression from the promoter epigenetic profile alone.
```{bash}
# wget "ftp://ftp.ensembl.org/pub/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh38.84.gtf.gz"
# zcat Homo_sapiens.GRCh38.84.gtf.gz | awk '{if($3=="exon") print $10"\t"$5-$4}' | sed -e 's/"//g' -e 's/;//' | /beagle3/haky/users/saideep/software/bedtools2/bin/bedtools groupby -i - -g 1 -c 2 -o sum > Exon_lengths.txt
```
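
As a cross-check on the awk pipeline above (which sums raw exon intervals and can double-count overlapping exons), union exon lengths per gene can also be computed in R with GenomicFeatures. This sketch assumes the same Ensembl GTF and is not part of the original pipeline:

```{r}
#| eval: false
# Sketch: union exon length per gene from the same Ensembl GTF
library(GenomicFeatures)
txdb <- makeTxDbFromGFF("Homo_sapiens.GRCh38.84.gtf.gz", format = "gtf")
exons_by_gene <- exonsBy(txdb, by = "gene")
# reduce() merges overlapping exons so shared bases are counted once
union_exon_lengths <- sum(width(reduce(exons_by_gene)))
head(union_exon_lengths)
```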
```{r}
# Analysis parameters
length_norm_before = FALSE # if length-normalizing, do it before TMM?
length_norm_bool = FALSE   # apply gene length normalization at all?
scale = 1                  # divisor applied to the exon lengths
expPCs_reg_flu = 3         # number of expression PCs to regress out (Flu)
expPCs_reg_ni = 7          # number of expression PCs to regress out (NI)
source = "kallisto"        # counts source: "kallisto" or the filtered RNA-seq counts
```
```{r}
if (source == "kallisto") {
  expression_mat <- read.table("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/kallisto_counts.tsv")
} else {
  expression_mat <- read.table("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/RNAseq_filtered.counts.txt")
}
```
```{r}
exon_lengths_file <- "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-27-normalize_expression_GTEx/Exon_lengths.txt"
exon_lengths <- as.data.frame(read.table(exon_lengths_file, row.names=1))
exon_lengths$genes <- rownames(exon_lengths)

# Subset to genes present in both the expression matrix and the exon length table
geneset <- rownames(expression_mat)[rownames(expression_mat) %in% rownames(exon_lengths)]
expression_subset <- expression_mat[geneset,]
exon_lengths_subset <- exon_lengths[geneset,]

exon_length_gene_names <- rownames(exon_lengths_subset)
# Replace the data frame with a plain numeric vector of (scaled) lengths
exon_lengths <- as.numeric(exon_lengths_subset[,1])/scale

gene_annotation_file <- "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-21-aracena_predixcan/canonical_TSS_full_metadata.txt"
gene_annotation_table <- read.delim(gene_annotation_file, header = TRUE, sep=",")

# Length-normalized counts, used for the diagnostic boxplot here and for the
# quantile-normalization control at the end of the post
length_norm_expression <- as.data.frame(as.matrix(expression_subset)/exon_lengths)

png("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-27-normalize_expression_GTEx/length.png", width = 1000, height = 1000)
boxplot(log2(length_norm_expression))
dev.off()
boxplot(log2(length_norm_expression))
```
## Format metadata and collect covariates
```{r}
# Helper functions for metadata cleanup

# Mean-center a numeric metadata column
mean_center_clean=function(cols_local,column){
  index=which(colnames(cols_local)==column)
  cols_local[,index]=cols_local[,index]-mean(cols_local[,index])
  return(cols_local)
}

# Refactor the metadata and mean-center the desired column; here, age
pretty_up_cols=function(cols){
  cols=refactorize(cols)
  cols=mean_center_clean(cols,"Age")
  return(cols)
}

# Ensure that the relevant metadata columns are factors
refactorize=function(cols_local){
  cols_local$Genotyping_ID=factor(cols_local$Genotyping_ID,levels=unique(as.character(cols_local$Genotyping_ID)))
  cols_local$Sample=factor(cols_local$Sample)
  cols_local$Condition=factor(cols_local$Condition)
  cols_local$Batch=factor(cols_local$Batch)
  return(cols_local)
}
```
```{r}
# Load metadata
meta_data = read.table("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/metadata/RNAseq_metadata.txt")
```
### Covariates - genotype PCs
```{r}
# Generate genotype PCs and update the metadata
qtl_dir <- "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping"
gds_file <- paste0(qtl_dir, "/GDS_genotypes.gds")

gtypes = read.table(paste0(qtl_dir, "/SNP_genotypes.txt"), header = TRUE, stringsAsFactors = FALSE)
samples=colnames(gtypes)
gtypes_pca=data.frame(snp_id=rownames(gtypes),gtypes)

snpgdsCreateGeno(gds_file,
                 genmat = as.matrix(gtypes_pca[, samples]),
                 sample.id = unique(samples),
                 snp.id = gtypes_pca$snp_id,
                 snpfirstdim=TRUE)

## Report the total number of samples and SNPs in the .gds file
## (35 samples and 7,383,243 SNPs in this run)
snpgdsSummary(gds_file)

## Load the .gds file in as genofile
genofile <- snpgdsOpen(gds_file)

## Perform a PCA on the genotype matrix
pca <- snpgdsPCA(genofile)

## Subset the first PC
tab <- data.frame(sample.id = pca$sample.id,
                  PC1 = pca$eigenvect[,1], # the first eigenvector
                  stringsAsFactors = FALSE)
rownames(tab) <- tab$sample.id

# Individual IDs are the expression column names with the condition suffix stripped
individuals <- sub("_.*", "", colnames(expression_subset))
tab <- tab[tab$sample.id %in% individuals,]

samples_with_gtype_pc <- c(paste0(tab$sample.id, "_Flu"),paste0(tab$sample.id, "_NI"))
gtype_pc <- c(tab$PC1,tab$PC1)

# Include only samples with genotype PCs
meta_data_with_gtype_pc <- meta_data[samples_with_gtype_pc,]
dim(meta_data_with_gtype_pc)

meta_data_with_gtype_pc$gtype_PC1 <- gtype_pc
meta_data <- meta_data_with_gtype_pc
# Note: `reads` is not defined until the next chunk, so only the metadata
# dimensions are checked here
dim(meta_data)
```
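
Since only the first genotype PC is used downstream, it is worth checking how much variance each component explains; snpgdsPCA stores this in its `varprop` field:

```{r}
# Percent of variance explained by each genotype PC; this is the basis
# for regressing out only PC1
pc_percent <- pca$varprop * 100
head(round(pc_percent, 2))
```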
## Prepare data for normalization
```{r}
########################################
### Format reads matrices & metadata ###
########################################
# Use only the samples that have genotype PCs
reads <- expression_subset[,samples_with_gtype_pc]

# Subset to samples with a 1 in the "PopDE_set" or "EQTL_set" column
meta_data=meta_data[which(meta_data$PopDE_set==1 | meta_data$EQTL_set==1),]

# Ensure that the sample IDs are the rownames of the meta_data dataframe
rownames(meta_data)=meta_data$Sample

# Order the meta_data dataframe alphabetically by sample ID
meta_data=meta_data[order(rownames(meta_data)),]

# Keep only read-count columns whose samples appear in the metadata
reads=reads[,which(colnames(reads) %in% rownames(meta_data))]

# Order the read counts matrix the same way
reads=reads[,order(colnames(reads))]

# This should be 0: a check that the metadata and read counts matrix
# contain the same samples in the same order
length(which(rownames(meta_data)!=colnames(reads)))

# Are metadata and reads the same dimensions?
dim(meta_data)[1]==dim(reads)[2]

# Mean center age
meta_data=pretty_up_cols(meta_data)

### Get condition-specific reads matrices
## Subset the NI samples only
reads_NI=reads[,which(meta_data$Condition=="NI")]
## Subset the Flu samples only
reads_FLU=reads[,which(meta_data$Condition=="Flu")]

### Get condition-specific metadata tables
meta_data_NI=refactorize(meta_data[which(meta_data$Condition=="NI"),])
meta_data_FLU=refactorize(meta_data[which(meta_data$Condition=="Flu"),])
```
## Normalization - TMM + Voom
```{r}
######################################################
### Voom normalize reads, remove batch with Combat ###
######################################################
COR=function(reads, meta_data, gene_lengths_norm=c(1)){
  dge <- DGEList(counts=reads)
  dge <- calcNormFactors(dge)  # TMM normalization factors
  design=model.matrix(~Age+Admixture,data=meta_data)
  v <- voom(dge,design,plot=FALSE)
  v_no_comb <- voom(dge,design,plot=FALSE)  # keep a pre-ComBat copy
  v_combat = ComBat(dat=as.matrix(v$E), batch=meta_data$Batch, mod=design, par.prior=TRUE)
  if (length(gene_lengths_norm) == 1) {
    v$E=v_combat
  } else {
    # Undo the log2 transform, divide by gene length, and re-log
    lnorm_c_combat <- as.matrix(v_combat)
    lnorm_c_combat <- 2**lnorm_c_combat - 0.5
    lnorm_c_combat <- lnorm_c_combat/gene_lengths_norm
    lnorm_c_combat <- log2(lnorm_c_combat + 0.5)
    v$E <- lnorm_c_combat
  }
  fit <- lmFit(v,design)
  fit <- eBayes(fit)
  return(list(fit, v, v_no_comb))
}
# If length normalization happens before TMM, divide the raw counts by
# exon length up front
if (length_norm_bool && length_norm_before) {
  reads_NI = as.data.frame(as.matrix(reads_NI)/exon_lengths)
  reads_FLU = as.data.frame(as.matrix(reads_FLU)/exon_lengths)
}

# Otherwise, pass the exon lengths into COR so length normalization happens
# after ComBat; c(1) disables length normalization entirely
lengths_for_COR <- if (length_norm_bool && !length_norm_before) exon_lengths else c(1)

# Correct for batch within each condition
COR_NI=COR(reads=reads_NI, meta_data=meta_data_NI, gene_lengths_norm=lengths_for_COR)
fit_NI=COR_NI[[1]]
v_NI=COR_NI[[2]]
v_NI_no_comb=COR_NI[[3]]

COR_FLU=COR(reads=reads_FLU, meta_data=meta_data_FLU, gene_lengths_norm=lengths_for_COR)
fit_FLU=COR_FLU[[1]]
v_FLU=COR_FLU[[2]]
v_FLU_no_comb=COR_FLU[[3]]
```
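
To see what ComBat actually removed, the pre- and post-correction matrices returned by COR can be compared on their leading PCs, coloring samples by batch. A quick diagnostic sketch, not part of the pipeline:

```{r}
# Diagnostic: batch structure in the NI samples before vs. after ComBat
pca_before <- prcomp(t(v_NI_no_comb$E))
pca_after <- prcomp(t(v_NI$E))
plot(pca_before$x[,1], pca_before$x[,2], col=meta_data_NI$Batch,
     xlab="PC1", ylab="PC2", main="NI: before ComBat")
plot(pca_after$x[,1], pca_after$x[,2], col=meta_data_NI$Batch,
     xlab="PC1", ylab="PC2", main="NI: after ComBat")
```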
## Covariates - non-admixture covariates + genotype PCs
```{r}
## Regress out effects of non-admixture covariates within condition
correct_exp=function(v,fit){
  corrected_exp=v$E
  ## Regress out the second column of the design matrix, which is age
  ## (the first is the intercept and the third is admixture)
  indexes=c(2)
  for(i in indexes){
    # Subtract the fitted age effect: coefficients (genes x 1) times the
    # transposed design column (1 x samples)
    corrected_exp <- corrected_exp - fit$coefficients[,i]%*%t(fit$design[,i])
  }
  return(corrected_exp)
}
corrected_exp_NI=correct_exp(v_NI,fit_NI)
corrected_exp_FLU=correct_exp(v_FLU,fit_FLU)
```
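
A quick sanity check on the regression (illustrative, not part of the pipeline): after correction, the per-gene correlation of expression with the mean-centered age should be close to zero:

```{r}
# Per-gene correlation with age after regressing out the age effect
age_cor_NI <- apply(corrected_exp_NI, 1, function(g) cor(g, meta_data_NI$Age))
summary(age_cor_NI)
```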
## Covariates - expression PCs
```{r}
###########################################################################
## Build matrixEQTL input: expression tables: regressing out first n PCs ##
###########################################################################
## Empirically remove the first n PCs from the expression data by
## regressing each gene on those PCs and keeping the residuals
pca_rm <- function(input_data, pc_set) {
  pca = prcomp(t(input_data))
  new = input_data
  new = apply(new, 1, FUN = function(x){return(lm(as.numeric(x) ~ -1 + pca$x[, as.numeric(pc_set)])$resid)})
  new = t(new)
  colnames(new) = colnames(input_data)
  rownames(new) = rownames(input_data)
  return(new)
}

if(expPCs_reg_flu==0){
  double_expression_Flu = corrected_exp_FLU
}else{
  double_expression_Flu = pca_rm(corrected_exp_FLU, c(1:expPCs_reg_flu))
}

if(expPCs_reg_ni==0){
  double_expression_NI = corrected_exp_NI
}else{
  double_expression_NI = pca_rm(corrected_exp_NI, c(1:expPCs_reg_ni))
}
```
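
The choice of how many expression PCs to remove (3 for Flu, 7 for NI) can be eyeballed from a scree plot of the corrected expression. Illustrative, shown here for the Flu condition:

```{r}
# Proportion of variance explained by the leading expression PCs (Flu)
pca_flu_scree <- prcomp(t(corrected_exp_FLU))
var_explained <- pca_flu_scree$sdev^2 / sum(pca_flu_scree$sdev^2)
barplot(var_explained[1:10], names.arg=1:10,
        xlab="Expression PC", ylab="Proportion of variance explained")
```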
```{r}
## Bind the corrected expression matrices back together
all_corrected_exp <- cbind(double_expression_NI, double_expression_Flu)
png("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-27-normalize_expression_GTEx/length_library_batch_cov_1.png", width = 1000, height = 1000)
boxplot(all_corrected_exp)
dev.off()
boxplot(all_corrected_exp)
```
```{r}
file <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_expression_{expPCs_reg_flu}_{expPCs_reg_ni}_lnorm{length_norm_bool}_before{length_norm_before}_scale{scale}_source{source}.txt")
write.table(all_corrected_exp, file = file)
```
### Extra: quantile normalization code
While the intention is not to use quantile normalization for the model training/analysis, it is included here as a control for running S-PrediXcan.
```{r}
# Charles' Python code:
# for i in range(5314, data_XY.shape[1]):
#     data_XY.iloc[:, i] = data_XY.iloc[:, i].rank(pct=True)

# R version of the above: convert each sample to percentile ranks
quantile_norm_expression <- as.data.frame(apply(length_norm_expression, 2, function(x) rank(x, na.last="keep")/sum(!is.na(x))))
boxplot(quantile_norm_expression)
file <- "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression_NO_CORRECTION_QUANTILE.txt"
write.table(quantile_norm_expression, file = file)
```
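
Note that the rank transform above maps each sample to a uniform set of percentiles rather than matching samples to a common empirical distribution. If the latter is ever needed, preprocessCore (already loaded above) provides it; a minimal sketch:

```{r}
# Alternative (not used here): full quantile normalization to a shared
# empirical distribution via preprocessCore
qn <- preprocessCore::normalize.quantiles(as.matrix(length_norm_expression))
dimnames(qn) <- dimnames(length_norm_expression)
```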