0.1_normalize_peak_data

Author

Saideep Gona

Published

March 24, 2024

Context

Having successfully run Con-EnPACT on gene expression data, we can now consider doing so on peak data. This notebook will normalize the per-peak read count data for the various modalities in the Aracena et al. study.

How should this be done? The process is very similar to that for expression data, the only difference being that we do not need to correct for isoform length. What may be of concern, however, is the effective length of read mapping per peak. The steps therefore follow the original Aracena et al. study (a minimal sketch of these steps on toy data appears after the lists below):

  1. TMM normalization using edgeR
  2. Batch correction using ComBat
  3. voom normalization for mean-variance modelling of the counts (includes log-CPM values)

and covariate correction:

  1. Regress out age and admixture, etc.
  2. Just one genotyping PC (based on the observed variance explained across individuals)
  3. Expression PCs based on the following table from the Aracena et al. supplement:

     QTL set        Non-infected   Flu-infected
     eQTL           1 to 4         1 to 4
     caQTL          1 to 3         1 to 3
     H3K27acQTL     1              1
     H3K27me3QTL    1              1 to 2
     H3K4me1QTL     1 to 2         1 to 4
     H3K4me3QTL     1 to 2         1 to 2

Potentially, correct for effective length by using larger peak regions.
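
As a quick orientation, below is a minimal sketch of the TMM → voom → ComBat sequence applied to a toy counts matrix. Everything in it (matrix dimensions, batch labels, covariate values) is simulated purely for illustration; the actual pipeline used in this notebook is the tmm_voom_combat() function defined further down.

Code
## Minimal illustration of the normalization steps on simulated data.
## All values here are made up; the real pipeline lives in tmm_voom_combat() below.
library(edgeR)
library(limma)
library(sva)

set.seed(1)
toy_counts <- matrix(rpois(200 * 8, lambda = 50), nrow = 200,
                     dimnames = list(paste0("peak", 1:200), paste0("sample", 1:8)))
toy_meta <- data.frame(Age       = rnorm(8, mean = 40, sd = 10),
                       Admixture = runif(8),
                       Batch     = factor(rep(c("A", "B"), each = 4)))

# TMM normalization with edgeR
dge <- DGEList(counts = toy_counts)
dge <- calcNormFactors(dge)

# voom: mean-variance modelling, returns log-CPM values plus precision weights
design <- model.matrix(~ Age + Admixture, data = toy_meta)
v <- voom(dge, design, plot = FALSE)

# ComBat batch correction applied to the voom log-CPM matrix
v_combat <- ComBat(dat = as.matrix(v$E), batch = toy_meta$Batch,
                   mod = design, par.prior = TRUE)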

Code
## Load libraries
library(edgeR)
library(glue)
library(tidyverse)
library(preprocessCore)
library(limma)
library(statmod)
library(sva)
library(reshape2)
library(dplyr)
library(stringr)
library(SNPRelate)

Metadata and covariate preparation functions

Code
# Set some functions

# This function mean-centers the specified metadata column.
mean_center_clean=function(cols_local,column){
  index=which(colnames(cols_local)==column)
  cols_local[,index]=cols_local[,index]-mean(cols_local[,index])
  return(cols_local)
}
# This function applies the mean-centering function to the desired metadata columns. Here we apply it to age.
pretty_up_cols=function(cols){

  cols=refactorize(cols)
  cols=mean_center_clean(cols,"Age")
  return(cols)
}
# This function makes sure that desired metadata columns are factors.
refactorize=function(cols_local){

  cols_local$Genotyping_ID=factor(cols_local$Genotyping_ID,levels=unique(as.character(cols_local$Genotyping_ID)))
  cols_local$Sample=factor(cols_local$Sample)
  cols_local$Condition=factor(cols_local$Condition)
  cols_local$Batch=factor(cols_local$Batch)
  return(cols_local)
}

# Build the metadata table: restrict to individuals present in the counts matrix,
# the genotype PCs, and the metadata, then attach genotype PC1 to both conditions.
build_metadata <- function(path_to_metadata, dataset, genotype_pc) {

  meta_data <- read.table(path_to_metadata, header = TRUE,  row.names = 1)

  # Strip the condition suffix (e.g. "_Flu", "_NI") to get individual IDs
  dset_individuals <- sub("_.*", "", colnames(dataset))
  genotype_pc_individuals <- sub("_.*", "", genotype_pc$sample.id)
  metadata_individuals <- sub("_.*", "", meta_data$Genotyping_ID)

  # Keep only individuals present in all three sources
  common_individuals <- intersect(dset_individuals, genotype_pc_individuals)
  common_individuals <- intersect(common_individuals, metadata_individuals)

  genotype_pc_filt <- genotype_pc[genotype_pc$sample.id %in% common_individuals,]

  # Each individual contributes a Flu and an NI sample, both sharing the same PC1 value
  samples_with_gtype_pc <- c(paste0(genotype_pc_filt$sample.id, "_Flu"),paste0(genotype_pc_filt$sample.id, "_NI"))
  gtype_pc <- c(genotype_pc_filt$PC1,genotype_pc_filt$PC1)

  meta_data_with_gtype_pc <- meta_data[samples_with_gtype_pc,]

  meta_data_with_gtype_pc$gtype_PC1 <- gtype_pc

  return(list(meta_data_with_gtype_pc, samples_with_gtype_pc))

}

Normalization functions

Code
prepare_reads_and_metadata <- function(reads_in, meta_data, samples_with_gtype_pc) {

  reads <- reads_in
  reads <- reads[,samples_with_gtype_pc]

  #subset to only include samples with a 1 in the "PopDE_set" or "EQTL_set" column.
  meta_data=meta_data[which(meta_data$PopDE_set==1 | meta_data$EQTL_set==1),]
  #ensure that the sample IDs are the rownames of the meta_data dataframe
  rownames(meta_data)=meta_data$Sample
  #ensure that the meta_data dataframe is ordered alphabetically (based on sample ID)
  meta_data=meta_data[order(rownames(meta_data)),]

  #keep only read-count columns whose samples appear in the metadata.
  reads=reads[,which(colnames(reads) %in% rownames(meta_data))]
  #order the read counts matrix the same way.
  reads=reads[,order(colnames(reads))]

  #this should be 0: a check that the metadata rows and read-count columns refer to the same samples, in the same order.
  length(which(rownames(meta_data)!=colnames(reads)))

  # Do metadata and reads describe the same number of samples?
  dim(meta_data)[1]==dim(reads)[2]

  # mean center age
  meta_data=pretty_up_cols(meta_data)

  ### Get condition-specific reads matrices
  ##subset the NI samples only
  reads_NI=reads[,which(meta_data$Condition=="NI")]
  ##subset the Flu samples only
  reads_Flu=reads[,which(meta_data$Condition=="Flu")]

  ### Get condition-specific metadata tables
  meta_data_NI=refactorize(meta_data[which(meta_data$Condition=="NI"),])
  meta_data_Flu=refactorize(meta_data[which(meta_data$Condition=="Flu"),])

  return(list(reads_Flu, reads_NI, meta_data_Flu, meta_data_NI))
}

# TMM-normalize, voom-transform, and batch-correct a counts matrix.
# Returns the limma fit, the ComBat-corrected voom object, and an uncorrected copy.
tmm_voom_combat <- function(reads,meta_data, no_voom=FALSE){

  # TMM normalization
  dge <- DGEList(counts=reads)
  dge <- calcNormFactors(dge)
  # cpm_tmm <- cpm(dge,log=TRUE,prior.count=1)

  # voom mean-variance modelling with age and admixture in the design
  design=model.matrix(~Age+Admixture,data=meta_data)
  v <- voom(dge,design,plot=FALSE)
  # keep an uncorrected copy of the voom object as well
  v_no_comb <- voom(dge,design,plot=FALSE)

  # ComBat batch correction applied to the voom log-CPM values
  v_combat = ComBat(dat=as.matrix(v$E), batch=meta_data$Batch, mod=design, par.prior=TRUE)
  v$E=v_combat

  # Fit the covariate model so its coefficients can be regressed out later
  fit <-lmFit(v,design)
  fit <- eBayes(fit)

  return(list(fit, v, v_no_comb))
}

# Regress fitted covariate effects out of the (batch-corrected) log-CPM matrix.
correct_exp=function(v,fit){
  corrected_exp=v$E
  # corrected_exp=v
  ## regress out the second column of the design matrix, which is age
  ## (the first is the intercept and the third is admixture)
  indexes=c(2)

  for(i in indexes){
    corrected_exp <- corrected_exp - fit$coefficients[,i]%*%t(fit$design[,i])
  }
  return(corrected_exp)
}


correct_count_pcs <- function(input_counts, pcs_regressed) {

  if(pcs_regressed==0){
      return(input_counts)
  }else{
      ## Empirically remove PCs from the phenotype data.
      pc_set=c(1:pcs_regressed)

      ## Regress those out.
      pca = prcomp(t(input_counts), na.action = na.omit)
      new = input_counts
      print(dim(new))
      new = apply(new, 1, FUN = function(x){return(lm(as.numeric(x) ~ -1 + pca$x[, as.numeric(pc_set)])$resid)})
      print(dim(new))
      new = t(new)
      colnames(new) = colnames(input_counts)
      rownames(new) = rownames(input_counts)

      return(new)
  }

}
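
As a quick illustration of correct_count_pcs(), the sketch below runs it on a small random matrix (dimensions and values are made up) and checks that the output keeps the input's dimensions while the variance along the original leading PC is removed.

Code
## Toy illustration of correct_count_pcs(); the matrix is random and for demonstration only.
set.seed(2)
toy_mat <- matrix(rnorm(50 * 6), nrow = 50,
                  dimnames = list(paste0("peak", 1:50), paste0("sample", 1:6)))

toy_corrected <- correct_count_pcs(toy_mat, pcs_regressed = 1)

dim(toy_corrected)                      # same dimensions as the input
head(prcomp(t(toy_mat))$sdev)           # PC standard deviations before correction
head(prcomp(t(toy_corrected))$sdev)     # after: the leading standard deviation drops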

Calculate Genotype PCs

Code
# gtypes = read.table(paste0("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes.txt"),header = TRUE, stringsAsFactors = FALSE)

# samples=colnames(gtypes)
# gtypes_pca=data.frame(snp_id=rownames(gtypes),gtypes)

# snpgdsCreateGeno(paste0("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping", "/GDS_genotypes.gds"),
#                  genmat = as.matrix(gtypes_pca[, samples]),
#                  sample.id = unique(samples),
#                  snp.id = gtypes_pca$snp_id,
#                  snpfirstdim=TRUE)

# ## This command tells you the total number of samples and SNPs in the .gds file.
# snpgdsSummary(paste0("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping", "/GDS_genotypes.gds"))

# ## Load .gds file in as genofile.
# genofile <- snpgdsOpen(paste0("/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping", "/GDS_genotypes.gds"))

# ## Perform a PCA on the genotype information in genofile.
# pca <- snpgdsPCA(genofile)
# ## Subset the first PC
# genotype_pc_df <- data.frame(sample.id = pca$sample.id,
#                   PC1 = pca$eigenvect[,1],    # the first eigenvector
#                   stringsAsFactors = FALSE)

# rownames(genotype_pc_df) <- genotype_pc_df$sample.id
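
For reference, all that build_metadata() consumes from this block is a data frame with a sample.id column and a PC1 column, indexed by individual ID. The placeholder below only illustrates that shape; the IDs and PC values are made up.

Code
## Illustrative only: the shape of the genotype-PC data frame expected by build_metadata().
## Individual IDs and PC1 values here are placeholders, not real data.
example_genotype_pc_df <- data.frame(sample.id = c("IND001", "IND002", "IND003"),
                                     PC1       = c(0.12, -0.05, -0.07),
                                     stringsAsFactors = FALSE)
rownames(example_genotype_pc_df) <- example_genotype_pc_df$sample.id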

Prepare the files and normalization parameters

Code
root_input_files <- "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/"
root_output_files <- "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/normalized_peak_data"

raw_peak_files <- list(
  "H3K27ac"=glue("{root_input_files}/counts_matrices/H3K27ac_filtered.counts.txt"),
  "H3K27me3"=glue("{root_input_files}/counts_matrices/H3K27me3_filtered.counts.txt"),
  "H3K4me1"=glue("{root_input_files}/counts_matrices/H3K4me1_filtered.counts.txt"),
  "H3K4me3"=glue("{root_input_files}/counts_matrices/H3K4me3_filtered.counts.txt"),
  "ATAC"=glue("{root_input_files}/counts_matrices/ATACseq_filtered.counts.txt")
) 

raw_metadata_files <- list(
  "H3K27ac"=glue("{root_input_files}/metadata/H3K27ac_metadata_samplewithoutmodality.txt"),
  "H3K27me3"=glue("{root_input_files}/metadata/H3K27me3_metadata_samplewithoutmodality.txt"),
  "H3K4me1"=glue("{root_input_files}/metadata/H3K4me1_metadata_samplewithoutmodality.txt"),
  "H3K4me3"=glue("{root_input_files}/metadata/H3K4me3_metadata_samplewithoutmodality.txt"),
  "ATAC"=glue("{root_input_files}/metadata/ATACseq_metadata_samplewithoutmodality.txt")
)

# Number of signal PCs to regress out per modality, following the Aracena et al.
# supplement table in the Context section (ATAC corresponds to the caQTL row).
peak_pcs_NI <- list(
  "H3K27ac"=1,
  "H3K27me3"=1,
  "H3K4me1"=2,
  "H3K4me3"=2,
  "ATAC"=3
)

peak_pcs_Flu <- list(
  "H3K27ac"=1,
  "H3K27me3"=2,
  "H3K4me1"=4,
  "H3K4me3"=2,
  "ATAC"=3
)
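
If one wanted to double-check these PC counts against the data, a small helper along the following lines (the function name and usage are illustrative, not part of the original pipeline) could report the proportion of variance explained by the leading signal PCs of a normalized matrix.

Code
## Hypothetical helper (not part of the original pipeline): proportion of variance
## explained by the first n_pcs principal components of a normalized peaks-by-samples matrix.
report_pc_variance <- function(normalized_mat, n_pcs = 5) {
  pca <- prcomp(t(normalized_mat))
  prop_var <- pca$sdev^2 / sum(pca$sdev^2)
  round(prop_var[seq_len(min(n_pcs, length(prop_var)))], 3)
}

## Example (using an object produced inside the loop below, if it has been run):
## report_pc_variance(corrected_exp_NI)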

Normalize the data

Code
# for (modality in names(raw_peak_files)) {
#   print(modality)

#   expPCs_reg_NI <- peak_pcs_NI[[modality]]
#   expPCs_reg_Flu <- peak_pcs_Flu[[modality]]
#   o_file <- glue("{root_output_files}/fully_preprocessed_{modality}_{expPCs_reg_Flu}_{expPCs_reg_NI}.txt")

#   if (file.exists(o_file)) {
#     next
#   }

#   peak_data_file <- raw_peak_files[[modality]]
#   metadata_file <- raw_metadata_files[[modality]]

#   peak_data <- read.table(peak_data_file, header = TRUE, row.names = 1, stringsAsFactors = FALSE)

#   metadata_o <- build_metadata(metadata_file, peak_data, genotype_pc_df)
#   metadata <- metadata_o[[1]]
#   samples_with_gtype_pc <- metadata_o[[2]]

#   reads_and_metadata <- prepare_reads_and_metadata(peak_data, metadata, samples_with_gtype_pc)
#   reads_Flu <- reads_and_metadata[[1]]
#   reads_NI <- reads_and_metadata[[2]]
#   meta_data_Flu <- reads_and_metadata[[3]]
#   meta_data_NI <- reads_and_metadata[[4]]

#   # Perform tmm_voom_combat

#   tvc_NI <- tmm_voom_combat(reads=reads_NI,meta_data=meta_data_NI)
#   fit_NI <- tvc_NI[[1]]
#   v_NI <- tvc_NI[[2]]
#   v_NI_no_comb <- tvc_NI[[3]]

#   tvc_Flu <- tmm_voom_combat(reads=reads_Flu, meta_data=meta_data_Flu)
#   fit_Flu <- tvc_Flu[[1]]
#   v_Flu <- tvc_Flu[[2]]
#   v_Flu_no_comb <- tvc_Flu[[3]]

#   # Correct some covariates

#   corrected_exp_NI <- correct_exp(v_NI,fit_NI)
#   corrected_exp_Flu <- correct_exp(v_Flu,fit_Flu)

#   # Correct signal PCs

#   corrected_exp_NI_both <- correct_count_pcs(corrected_exp_NI, expPCs_reg_NI)

#   corrected_exp_Flu_both <- correct_count_pcs(corrected_exp_Flu, expPCs_reg_Flu)

#   all_corrected_exp <- cbind(corrected_exp_NI_both, corrected_exp_Flu_both)

#   write.table(all_corrected_exp, file = o_file)

# }