This notebook provides instructions on running acNMF starting with a (genes x cells) counts matrix of expression values. In this example we will use simulated data from Kotliar et. al, 2019, but a raw counts matrix is all that is needed to begin.
The matrix must first be split into two roughly equal parts (referred to interchangeably as splits), such that approximately equal numbers of cells are contained in each split.
library(Seurat)
library(SeuratDisk)
set.seed(37645)
data <- readRDS("cNMF_simulated.RDS")
data <- as.data.frame(data)
dim(data)
## [1] 15000 25000
dummy_sep <- rbinom(nrow(data), 1, 0.5)
split0 <- data[dummy_sep == 0, ]
split1 <- data[dummy_sep == 1, ]
dim(split0)
## [1] 7516 25000
dim(split1)
## [1] 7484 25000
split0 <- t(as.matrix(split0))
split1 <- t(as.matrix(split1))
s0 <- CreateSeuratObject(counts = split0)
s1 <- CreateSeuratObject(counts = split1)
SaveH5Seurat(s0, filename = "cnmf_input_benchmark_split0.h5Seurat")
SaveH5Seurat(s1, filename = "cnmf_input_benchmark_split1.h5Seurat")
Convert("cnmf_input_benchmark_split0.h5Seurat", dest = "h5ad", assay = "RNA")
Convert("cnmf_input_benchmark_split1.h5Seurat", dest = "h5ad", assay = "RNA")
Run cNMF on each file generated above. This needs to be performed over a range of ranks. The following python code was used on split0.
import csv
import os
import pandas as pd
import numpy as np
from scipy.io import mmread
import scipy.sparse as sp
import matplotlib.pyplot as plt
import scanpy as sc
from IPython.display import Image
np.random.seed(14)
########################
#Set cNMF run parameters
########################
numiter = 200
numhvgenes = 2000
K = ' '.join([str(i) for i in range(2,61)] + [str(i) for i in range(70,210,10)])
K_int = [int(i) for i in K.split()]
numworkers = numiter*len(K_int)
output_directory = "/<path>/<to>/<output_dir>/"
run_name = 'cNMF_split0'
countfn = "cnmf_input_benchmark_split0.h5ad"
seed = 14
###############################
#Set up directories for LSF i/o
###############################
logdir = os.getcwd() + '/split0_logdir'
os.system('mkdir ' + logdir)
factorize_dir = logdir + '/factorize'
os.system('mkdir ' + factorize_dir)
combine_dir = logdir + '/combine'
os.system('mkdir ' + combine_dir)
consensus_dir = logdir + '/consensus'
os.system('mkdir ' + consensus_dir)
##############
#Preprocessing
##############
prepare_cmd = 'python /home/rchapple/software/cNMF/cnmf_v2.0.py prepare --output-dir %s --name %s -c %s -k %s --n-iter %d --total-workers %d --seed %d --numgenes %d --beta-loss frobenius' % (output_directory, run_name, countfn, K, numiter, numworkers, seed, numhvgenes)
print('Prepare command assuming parallelization with %d tasks:\n%s' % (numworkers, prepare_cmd))
os.system(prepare_cmd)
##############
#Factorization
##############
worker_index = ' '.join([str(x) for x in range(numworkers)])
#Set up the job submission array
#Each job runs all iterations of NMF for each rank in one instance of opening the cNMF_v2.0.py script
#The number of jobs that get submitted to HPC are equivalent to the number of ranks
start = [int(i) for i in range(0, numworkers-1, numiter)]
end = [int(i) for i in range(numiter-1, numworkers, numiter)]
nmf_job_data = {'K':pd.Series(K_int), 'Start':pd.Series(start), 'End':pd.Series(end)}
nmf_job_submission = pd.DataFrame(nmf_job_data)
for x in nmf_job_submission.index:
factorize_cmd = "bsub -P RC -J split0_factorize -R \"span[hosts=1] rusage[mem=10GB]\" -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py factorize --output-dir %s --name %s --jobstart %d --jobend %d'" % (factorize_dir, factorize_dir, output_directory, run_name, nmf_job_submission['Start'][x], nmf_job_submission['End'][x])
print('Factorize command to run factorizations for rank = %d across all iterations' % (nmf_job_submission['K'][x]))
os.system(factorize_cmd)
wait = "bwait -w 'ended(split0_factorize)'"
os.system(wait)
#################################
#Combine factorization replicates
#################################
combine_cmd = "bsub -P RC -J split0_combine -R\"rusage[mem=400GB]\" -q large_mem -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py combine --output-dir %s --name %s'" % (combine_dir, combine_dir, output_directory, run_name)
print(combine_cmd)
print('\n')
os.system(combine_cmd)
wait_combine = "bwait -w 'ended(split0_combine)'"
os.system(wait_combine)
##########################################################
#Cluster and plot consensus programs and KNN outlier plots
##########################################################
from itertools import chain
density_threshold = 2.00
for x in chain(range(2,61), range(70,210,10)):
selected_K = x
print("Selected_K =", selected_K, "\n")
consensus_cmd = "bsub -P RC -J split0_consensus -R\"rusage[mem=400GB]\" -q large_mem -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py consensus --output-dir %s --name %s --local-density-threshold %.2f --components %d --show-clustering'" % (consensus_dir, consensus_dir, output_directory, run_name, density_threshold, selected_K)
print('Consensus command for K=%d:\n%s' % (selected_K, consensus_cmd))
os.system(consensus_cmd)
density_threshold_str = ('%.2f' % density_threshold).replace('.', '_')
#Outlier filtering
density_threshold = 0.10
for x in chain(range(2,61), range(70,210,10)):
selected_K = x
print("Selected_K =", selected_K, "\n")
consensus_cmd = "bsub -P RC -J split0_consensus_outlier -R\"rusage[mem=400GB]\" -q large_mem -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py consensus --output-dir %s --name %s --local-density-threshold %.2f --components %d --show-clustering'" % (consensus_dir, consensus_dir, output_directory, run_name, density_threshold, selected_K)
print('Consensus command for K=%d:\n%s' % (selected_K, consensus_cmd))
os.system(consensus_cmd)
density_threshold_str = ('%.2f' % density_threshold).replace('.', '_')
Next, we evaluate (for each rank) the similarity of the two data splits. The Jaccard similarity index is calculated, and significant GEPs between the splits are represented as node pairs in a network graph. Community detection on the underlying graph is calculated, and recorded.
library(jaccard)
library(igraph)
library(vroom)
set.seed(1234)
#This function computes the Jaccard index and significance test for all programs in each benchmark split
#Once this is completed, a network graph is built using igraph package
#Community detection of the network graph is then calculated using betweeness scores
calcSplitStats <- function(split0, split1, k, jval){
print("calcSplitStats")
# for each row of split0, compare it to every row of split1
jaccardInd <- numeric()
jaccardIndPs <- numeric()
for(i in 1:ncol(split0)){
for(j in 1:ncol(split1)){
GEP_split0 = as.numeric(rank(-split0[, i]) < jval)
GEP_split1 = as.numeric(rank(-split1[, j]) < jval)
jaccardTest <- jaccard.test.mca(GEP_split0, GEP_split1)
jaccardInd <- c(jaccardInd, jaccardTest$statistics)
jaccardIndPs <- c(jaccardIndPs, jaccardTest$pvalue)
}
}
# Calculate FWERs
k <- as.numeric(k)
fwerJaccard <- p.adjust(jaccardIndPs, method="bonferroni")
fwerJaccard <- t(matrix(fwerJaccard, nrow = k, ncol = k))
# Calculate number of significant findings.
sigFwerJaccard <- which(fwerJaccard < 0.05, arr.ind = T)
#Calculate community number
if(nrow(sigFwerJaccard) > 0){
community.df <- as.data.frame(sigFwerJaccard)
community.df$row <- paste0("Split0_GEP", community.df$row)
community.df$col <- paste0("Split1_GEP", community.df$col)
community.graph <- graph_from_data_frame(community.df, directed = F)
ceb <- cluster_edge_betweenness(community.graph)
community_number <- length(communities(ceb))
}else{
community_number <- 0
}
return(list(jaccardIndPs=jaccardIndPs, jaccardInd=jaccardInd, sigFwerJaccard=sigFwerJaccard, jaccard.com=community_number))
}
ranks = c(seq(2,60), seq(70,200,10))
data.dir <- "/<path>/<to>/<cNMF_output>"
jaccard.val <- seq(10,100,10)
for(k in ranks){
#Results list
#Gets written to RDS file at end of R script
jaccardtest_results <- list()
#Read in NMF gene spectra scores
print(paste0("Rank: ", k))
split0 <- t(vroom(file = paste0(data.dir, "cNMF_split0/cNMF_split0.gene_spectra_score.k_", k, ".dt_0_10.txt"), col_select = c(-...1), show_col_types = FALSE))
split1 <- t(vroom(file = paste0(data.dir, "cNMF_split1/cNMF_split1.gene_spectra_score.k_", k, ".dt_0_10.txt"), col_select = c(-...1), show_col_types = FALSE))
#Compute jaccard index for different jaccard lengths
for(jval in jaccard.val){
print(paste0("Jaccard Length: ", jval))
splitstatlist <- list()
splitstatlist <- calcSplitStats(split0, split1, k, jval)
jval.name <- as.character(jval)
jaccardtest_results[[jval.name]] <- splitstatlist
}
#This list contains the computations across all jaccard lengths for the rank provided on the command line
saveRDS(jaccardtest_results, file = paste0("jaccardtest_results_K", k, ".RDS"))
}
library(ggplot2)
library(dplyr)
simdata.dir <- "/<path>/<to>/<jaccardtest_results>/"
sim.df <- NULL
jaccardtest_results <- list()
for(k in ranks){
splitstats <- readRDS(file = paste0(simdata.dir, seed, "/results/jaccardtest_results_K", k, ".RDS"))
jaccardtest_results[[k]] <- splitstats
}
jaccard.val <- as.character(jaccard.val)
#for plotting
com.df <- NULL
for(x in ranks){
u <- NULL
for(y in jaccard.val){
u <- c(u, jaccardtest_results[[x]][[y]][["jaccard.com"]])
}
u <- cbind(u, x)
u <- data.frame(u, jaccard.val)
com.df <- rbind(com.df, u)
}
colnames(com.df) <- c("Community_Number", "Rank", "Jaccard_Length")
#rank on x axis
com.df$Rank <- as.integer(com.df$Rank)
com.df$Jaccard_Length <- as.character(com.df$Jaccard_Length)
com.df$Jaccard_Length <- factor(com.df$Jaccard_Length, levels = c("10", "20", "30", "40", "50", "60", "70", "80", "90", "100"))
sim_plot <- ggplot(com.df, aes(x=Rank,y=Community_Number, group = Jaccard_Length, color = Jaccard_Length)) +
stat_smooth(method="loess", span=0.2, se=TRUE, aes(fill=Jaccard_Length), alpha=0.3) +
theme_linedraw()
sim_plot
## `geom_smooth()` using formula = 'y ~ x'
This block of code evaluates which curve has the best inflection point
devtools::install_github("ahasverus/elbow")
library(elbow)
jl.eval <- data.frame()
for(x in jaccard.val){
x <- as.numeric(x)
jl.df <- com.df %>% filter(Jaccard_Length == x)
#Run a loess regression on JL data to smooth the curve
sl <- jl.df %>% ggplot(aes(Rank, Community_Number))
+ geom_point()
+ geom_smooth(method = "loess", span = 0.3, method.args = list(degree = 1))
gbuild <- ggplot_build(sl)
elbow.df <- data.frame(X = gbuild$data[[2]]$x, Y = gbuild$data[[2]]$y)
#Find inflection point
ipoint <- elbow(data = elbow.df, plot = F)
#Since the data was run on smoothed data, retreive the closest rank to the x intercept calculated by elbow package
optimum.rank <- ranks[which.min(abs(ranks - ipoint$X_selected))]
#Run linear regression on cost values (X values after inflection point) and determine slope
ipoint.filtered <- ipoint$data %>% filter(X >= ipoint$X_selected)
reg <- lm(ipoint.filtered$benefits ~ ipoint.filtered$X)
jl.eval <- rbind(jl.eval, c(x, optimum.rank, reg$coefficients[2]))
}
The following plots were generated when the Jaccard length was set to 10
sl
## `geom_smooth()` using formula 'y ~ x'
elbow(data=elbow.df)
## $X_selected
## [1] 22.05063
##
## $data
## X Y constant benefits
## 1 2.000000 1.098521 1.099 1.098521
## 2 4.506329 3.604137 1.247 3.455521
## 3 7.012658 6.032960 1.396 5.735521
## 4 9.518987 8.400256 1.545 7.953521
## 5 12.025316 10.597855 1.693 10.003521
## 6 14.531646 12.164626 1.842 11.421521
## 7 17.037975 13.074135 1.991 12.181521
## 8 19.544304 13.556808 2.140 12.515521
## 9 22.050633 13.707028 2.288 12.517521
## 10 24.556962 13.639138 2.437 12.300521
## 11 27.063291 13.446832 2.586 11.959521
## 12 29.569620 13.192802 2.735 11.556521
## 13 32.075949 13.033196 2.883 11.248521
## 14 34.582278 12.981633 3.032 11.048521
## 15 37.088608 12.988900 3.181 10.906521
## 16 39.594937 12.990984 3.330 10.759521
## 17 42.101266 12.867255 3.478 10.487521
## 18 44.607595 12.675900 3.627 10.147521
## 19 47.113924 12.443428 3.776 9.765521
## 20 49.620253 12.287690 3.924 9.462521
## 21 52.126582 12.354306 4.073 9.379521
## 22 54.632911 12.503159 4.222 9.379521
## 23 57.139241 12.636458 4.371 9.363521
## 24 59.645570 12.734758 4.519 9.314521
## 25 62.151899 12.680946 4.668 9.111521
## 26 64.658228 12.534118 4.817 8.815521
## 27 67.164557 12.367695 4.966 8.500521
## 28 69.670886 12.255096 5.114 8.239521
## 29 72.177215 12.211918 5.263 8.047521
## 30 74.683544 12.174566 5.412 7.861521
## 31 77.189873 12.141634 5.561 7.679521
## 32 79.696203 12.112824 5.709 7.502521
## 33 82.202532 12.087840 5.858 7.328521
## 34 84.708861 12.066385 6.007 7.157521
## 35 87.215190 12.048160 6.155 6.991521
## 36 89.721519 12.032869 6.304 6.827521
## 37 92.227848 12.020214 6.453 6.665521
## 38 94.734177 12.009898 6.602 6.506521
## 39 97.240506 12.001625 6.750 6.350521
## 40 99.746835 11.995096 6.899 6.194521
## 41 102.253165 11.990015 7.048 6.040521
## 42 104.759494 11.986084 7.197 5.887521
## 43 107.265823 11.983005 7.345 5.736521
## 44 109.772152 11.980483 7.494 5.584521
## 45 112.278481 11.979925 7.643 5.435521
## 46 114.784810 11.983365 7.791 5.290521
## 47 117.291139 11.990402 7.940 5.148521
## 48 119.797468 12.000619 8.089 5.010521
## 49 122.303797 12.013596 8.238 4.874521
## 50 124.810127 12.028916 8.386 4.741521
## 51 127.316456 12.046161 8.535 4.609521
## 52 129.822785 12.064911 8.684 4.479521
## 53 132.329114 12.084750 8.833 4.350521
## 54 134.835443 12.105259 8.981 4.222521
## 55 137.341772 12.126019 9.130 4.094521
## 56 139.848101 12.146613 9.279 3.966521
## 57 142.354430 12.168721 9.428 3.839521
## 58 144.860759 12.194274 9.576 3.716521
## 59 147.367089 12.222381 9.725 3.595521
## 60 149.873418 12.252145 9.874 3.476521
## 61 152.379747 12.282669 10.022 3.359521
## 62 154.886076 12.313052 10.171 3.240521
## 63 157.392405 12.342399 10.320 3.120521
## 64 159.898734 12.369809 10.469 2.999521
## 65 162.405063 12.396275 10.617 2.877521
## 66 164.911392 12.423624 10.766 2.756521
## 67 167.417722 12.451767 10.915 2.635521
## 68 169.924051 12.480615 11.064 2.515521
## 69 172.430380 12.510075 11.212 2.396521
## 70 174.936709 12.540056 11.361 2.277521
## 71 177.443038 12.570466 11.510 2.158521
## 72 179.949367 12.601215 11.659 2.040521
## 73 182.455696 12.632210 11.807 1.923521
## 74 184.962025 12.663361 11.956 1.805521
## 75 187.468354 12.694575 12.105 1.688521
## 76 189.974684 12.725762 12.253 1.571521
## 77 192.481013 12.756830 12.402 1.453521
## 78 194.987342 12.787687 12.551 1.335521
## 79 197.493671 12.818243 12.700 1.216521
## 80 200.000000 12.848406 12.848 1.098521
This programmatically determines the most optimal parameters (rank and Jaccard length) for the dataset
colnames(jl.eval) <- c("JL", "Rank", "Slope")
#This will determine which JL curve gives the steepest slope for the cost values
max.slope.params <- jl.eval %>% filter(JL == jl.eval$JL[jl.eval$Slope == min(jl.eval$Slope)])
comnum <- com.df %>% filter(Jaccard_Length == max.slope.params$JL & Rank == max.slope.params$Rank)
params <- as.data.frame(cbind(comnum$Community_Number, max.slope.params$Rank, max.slope.params$JL))
colnames(params) <- c("Community_Number", "Rank", "Jaccard_Length")
optimized.community.params[[<dataset.name>]] <- params
#Get the significant matches(indeces) from the Jaccard test
sig.labels <- as.data.frame(jaccardtest_results[[params$Rank]][[as.character(params$Jaccard_Length)]]$sigFwerJaccard)
sig.labels$row <- paste0("Split0_GEP", sig.labels$row)
sig.labels$col <- paste0("Split1_GEP", sig.labels$col)
com.labels[[<dataset.name>] <- sig.labels
saveRDS(optimized.community.params, file = "optimized_community_paramters.RDS")
saveRDS(com.labels, file = "community_labels.RDS")
Optimal parameters
optimized_community_parameters
## Community_Number Rank Jaccard_Length
## 1 14 22 10
Significant pairs of replicated GEPs across the splits
com.labels
## row col
## 1 Split0_GEP10 Split1_GEP2
## 2 Split0_GEP13 Split1_GEP3
## 3 Split0_GEP2 Split1_GEP4
## 4 Split0_GEP14 Split1_GEP6
## 5 Split0_GEP15 Split1_GEP7
## 6 Split0_GEP4 Split1_GEP8
## 7 Split0_GEP21 Split1_GEP9
## 8 Split0_GEP16 Split1_GEP10
## 9 Split0_GEP8 Split1_GEP11
## 10 Split0_GEP1 Split1_GEP12
## 11 Split0_GEP12 Split1_GEP14
## 12 Split0_GEP5 Split1_GEP15
## 13 Split0_GEP6 Split1_GEP17
## 14 Split0_GEP9 Split1_GEP21