Getting Started

This notebook provides instructions for running acNMF starting from a (cells x genes) counts matrix of expression values. In this example we use simulated data from Kotliar et al., 2019, but a raw counts matrix is all that is needed to begin.
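Before starting, it can be worth confirming that the input really is raw counts. A minimal sketch, using the same RDS file loaded below (the object name counts is arbitrary):

counts <- readRDS("cNMF_simulated.RDS")
counts <- as.matrix(counts)
#Raw counts are non-negative integers; normalized or log-transformed values are not
stopifnot(all(counts >= 0), all(counts == round(counts)))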

Splitting Data

The counts matrix must first be split into two roughly equal parts (referred to below as splits), such that each split contains approximately half of the cells.

Load data and split

library(Seurat)
library(SeuratDisk)
set.seed(37645)
data <- readRDS("cNMF_simulated.RDS")
data <- as.data.frame(data)
dim(data)
## [1] 15000 25000
#Randomly assign each cell (row) to one of the two splits with probability 0.5
dummy_sep <- rbinom(nrow(data), 1, 0.5)
split0 <- data[dummy_sep == 0, ]
split1 <- data[dummy_sep == 1, ]

dim(split0)
## [1]  7516 25000
dim(split1)
## [1]  7484 25000
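Optionally, a quick sanity check (not part of the original pipeline) confirms that the two splits partition the cells exactly:

#Every cell should appear in exactly one split
stopifnot(nrow(split0) + nrow(split1) == nrow(data))
stopifnot(length(intersect(rownames(split0), rownames(split1))) == 0)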

Convert to H5AD

#Transpose to (genes x cells); CreateSeuratObject expects features in rows
split0 <- t(as.matrix(split0))
split1 <- t(as.matrix(split1))

s0 <- CreateSeuratObject(counts = split0)
s1 <- CreateSeuratObject(counts = split1)

SaveH5Seurat(s0, filename = "cnmf_input_benchmark_split0.h5Seurat")
SaveH5Seurat(s1, filename = "cnmf_input_benchmark_split1.h5Seurat")

Convert("cnmf_input_benchmark_split0.h5Seurat", dest = "h5ad", assay = "RNA")
Convert("cnmf_input_benchmark_split1.h5Seurat", dest = "h5ad", assay = "RNA")

Running cNMF on Individual Splits

Run cNMF on each file generated above. This needs to be performed over a range of ranks (values of K). The following Python code was used on split0.

import csv
import os
import pandas as pd
import numpy as np
from scipy.io import mmread
import scipy.sparse as sp
import matplotlib.pyplot as plt
import scanpy as sc
from IPython.display import Image

np.random.seed(14)

########################
#Set cNMF run parameters
########################

numiter = 200
numhvgenes = 2000

#Ranks to test: 2-60, then 70-200 in steps of 10, passed to cNMF as a space-separated string
K = ' '.join([str(i) for i in range(2,61)] + [str(i) for i in range(70,210,10)])
K_int = [int(i) for i in K.split()]

numworkers = numiter*len(K_int)  #One worker per (rank, iteration) pair: 73 ranks x 200 iterations = 14600 tasks

output_directory = "/<path>/<to>/<output_dir>/"
run_name = 'cNMF_split0'

countfn = "cnmf_input_benchmark_split0.h5ad"

seed = 14

###############################
#Set up directories for LSF i/o
###############################

logdir = os.path.join(os.getcwd(), 'split0_logdir')
factorize_dir = os.path.join(logdir, 'factorize')
combine_dir = os.path.join(logdir, 'combine')
consensus_dir = os.path.join(logdir, 'consensus')
for d in (factorize_dir, combine_dir, consensus_dir):
    os.makedirs(d, exist_ok=True)  #Creates logdir implicitly

##############
#Preprocessing
##############

prepare_cmd = 'python /home/rchapple/software/cNMF/cnmf_v2.0.py prepare --output-dir %s --name %s -c %s -k %s --n-iter %d --total-workers %d --seed %d --numgenes %d --beta-loss frobenius' % (output_directory, run_name, countfn, K, numiter, numworkers, seed, numhvgenes)
print('Prepare command assuming parallelization with %d tasks:\n%s' % (numworkers, prepare_cmd))
os.system(prepare_cmd)

##############
#Factorization
##############

worker_index = ' '.join([str(x) for x in range(numworkers)])

#Set up the job submission array
#Each job runs all iterations of NMF for one rank in a single invocation of the cnmf_v2.0.py script
#The number of jobs submitted to the HPC is therefore equal to the number of ranks

start = [int(i) for i in range(0, numworkers-1, numiter)]
end = [int(i) for i in range(numiter-1, numworkers, numiter)]
nmf_job_data = {'K':pd.Series(K_int), 'Start':pd.Series(start), 'End':pd.Series(end)}
nmf_job_submission = pd.DataFrame(nmf_job_data)
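#For illustration (with numiter = 200), the first rows of nmf_job_submission
#pair each rank with a contiguous block of 200 worker indices:
#   K  Start  End
#   2      0  199
#   3    200  399
#   4    400  599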

for x in nmf_job_submission.index:
    factorize_cmd = "bsub -P RC -J split0_factorize -R \"span[hosts=1] rusage[mem=10GB]\" -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py factorize --output-dir %s --name %s --jobstart %d --jobend %d'" % (factorize_dir, factorize_dir, output_directory, run_name, nmf_job_submission['Start'][x], nmf_job_submission['End'][x])
    print('Factorize command to run factorizations for rank = %d across all iterations' % (nmf_job_submission['K'][x]))
    os.system(factorize_cmd)

wait = "bwait -w 'ended(split0_factorize)'"
os.system(wait)

#################################
#Combine factorization replicates
#################################

combine_cmd = "bsub -P RC -J split0_combine -R\"rusage[mem=400GB]\" -q large_mem -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py combine --output-dir %s --name %s'" % (combine_dir, combine_dir, output_directory, run_name)
print(combine_cmd)
print('\n')
os.system(combine_cmd)

wait_combine = "bwait -w 'ended(split0_combine)'"
os.system(wait_combine)


##########################################################
#Cluster and plot consensus programs and KNN outlier plots
##########################################################

from itertools import chain

density_threshold = 2.00  #2.00 is the maximum possible distance, so no spectra are filtered at this step

for x in chain(range(2,61), range(70,210,10)):
    selected_K = x
    print("Selected_K =", selected_K, "\n")
    consensus_cmd = "bsub -P RC -J split0_consensus -R\"rusage[mem=400GB]\" -q large_mem -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py consensus --output-dir %s --name %s --local-density-threshold %.2f --components %d --show-clustering'" % (consensus_dir, consensus_dir, output_directory, run_name, density_threshold, selected_K)
    print('Consensus command for K=%d:\n%s' % (selected_K, consensus_cmd))
    os.system(consensus_cmd)

density_threshold_str = ('%.2f' % density_threshold).replace('.', '_')  #'2_00', the dt tag cNMF uses in its output filenames

#Outlier filtering

density_threshold = 0.10  #Stricter threshold: outlier spectra are filtered before computing consensus

for x in chain(range(2,61), range(70,210,10)):
    selected_K = x
    print("Selected_K =", selected_K, "\n")
    consensus_cmd = "bsub -P RC -J split0_consensus_outlier -R\"rusage[mem=400GB]\" -q large_mem -oo %s -eo %s 'python /home/rchapple/software/cNMF/cnmf_v2.0.py consensus --output-dir %s --name %s --local-density-threshold %.2f --components %d --show-clustering'" % (consensus_dir, consensus_dir, output_directory, run_name, density_threshold, selected_K)
    print('Consensus command for K=%d:\n%s' % (selected_K, consensus_cmd))
    os.system(consensus_cmd)

density_threshold_str = ('%.2f' % density_threshold).replace('.', '_')  #'0_10', matching the dt_0_10 tag in the filenames used below

Post-cNMF Comparison of Each Data Split

Next, for each rank, we evaluate the similarity of the GEPs discovered in the two data splits. The Jaccard similarity index is calculated for every pair of programs, significantly similar pairs across the splits are represented as edges in a network graph, and the number of communities detected in that graph is recorded. The inputs are the gene spectra score files written by the outlier-filtered consensus step above (the dt_0_10 files).
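To make the comparison concrete, here is a toy illustration (hypothetical vectors, not part of the pipeline) of the Jaccard index on two binary gene-membership vectors, using the same jaccard package calls as the function below:

library(jaccard)

#Two hypothetical programs over a universe of 8 genes (1 = gene is in the top-gene set)
gep_split0 <- c(1, 1, 1, 0, 0, 0, 0, 0)
gep_split1 <- c(1, 1, 0, 1, 0, 0, 0, 0)

jaccard(gep_split0, gep_split1)                  #|intersection| / |union| = 2/4 = 0.5
jaccard.test.mca(gep_split0, gep_split1)$pvalue  #Significance via the MCA method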

The calcSplitStats function

library(jaccard)
library(igraph)
library(vroom)

set.seed(1234)

#This function computes the Jaccard index and a significance test for every pair of programs across the two benchmark splits
#Once this is completed, a network graph of the significant pairs is built using the igraph package
#Communities in the network graph are then detected using edge betweenness scores

calcSplitStats <- function(split0, split1, k, jval){
    print("calcSplitStats")
    # for each program (column) of split0, compare it to every program (column) of split1
    jaccardInd <- numeric()
    jaccardIndPs <- numeric()
    
    for(i in 1:ncol(split0)){
        for(j in 1:ncol(split1)){
            GEP_split0 = as.numeric(rank(-split0[, i]) < jval)
            GEP_split1 = as.numeric(rank(-split1[, j]) < jval)
            jaccardTest <- jaccard.test.mca(GEP_split0, GEP_split1)
            jaccardInd <- c(jaccardInd, jaccardTest$statistics)
            jaccardIndPs <- c(jaccardIndPs, jaccardTest$pvalue)
        }
    }
  
    # Calculate FWERs
    k <- as.numeric(k) 
    fwerJaccard <- p.adjust(jaccardIndPs, method="bonferroni")
    fwerJaccard <- t(matrix(fwerJaccard, nrow = k, ncol = k))
  
    # Calculate number of significant findings.
    sigFwerJaccard <- which(fwerJaccard < 0.05, arr.ind = T)
  
    #Calculate community number
    if(nrow(sigFwerJaccard) > 0){
        community.df <- as.data.frame(sigFwerJaccard)
        community.df$row <- paste0("Split0_GEP", community.df$row)
        community.df$col <- paste0("Split1_GEP", community.df$col)
        community.graph <- graph_from_data_frame(community.df, directed = F)
        ceb <- cluster_edge_betweenness(community.graph)
        community_number <- length(communities(ceb))
    }else{
        community_number <- 0
    }
    return(list(jaccardIndPs=jaccardIndPs, jaccardInd=jaccardInd, sigFwerJaccard=sigFwerJaccard, jaccard.com=community_number))
}
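As a quick illustration of the return value, here is a hypothetical smoke test on random data (the real inputs come from the cNMF spectra files below):

#1000 genes x 5 programs per split, comparing top-20 gene sets
set.seed(1)
toy0 <- matrix(rnorm(1000 * 5), ncol = 5)
toy1 <- matrix(rnorm(1000 * 5), ncol = 5)
res <- calcSplitStats(toy0, toy1, k = 5, jval = 20)
str(res, max.level = 1)  #jaccardIndPs, jaccardInd, sigFwerJaccard, jaccard.com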

Calculating the Jaccard similarity index over a range of Jaccard lengths

ranks = c(seq(2,60), seq(70,200,10))
data.dir <- "/<path>/<to>/<cNMF_output>/"
jaccard.val <- seq(10,100,10)  #Jaccard lengths: sizes of the top-gene sets to compare

for(k in ranks){
  #Results list
  #Gets written to RDS file at end of R script
  jaccardtest_results <- list()

  #Read in NMF gene spectra scores  
  print(paste0("Rank: ", k))
  split0 <- t(vroom(file = paste0(data.dir, "cNMF_split0/cNMF_split0.gene_spectra_score.k_", k, ".dt_0_10.txt"), col_select = c(-...1), show_col_types = FALSE))
  split1 <- t(vroom(file = paste0(data.dir, "cNMF_split1/cNMF_split1.gene_spectra_score.k_", k, ".dt_0_10.txt"), col_select = c(-...1), show_col_types = FALSE))
  
  #Compute jaccard index for each jaccard length
  for(jval in jaccard.val){
      print(paste0("Jaccard Length: ", jval))
      splitstatlist <- calcSplitStats(split0, split1, k, jval)
      jval.name <- as.character(jval)
      jaccardtest_results[[jval.name]] <- splitstatlist
  }      
  
  #This list contains the computations across all jaccard lengths for the current rank
  saveRDS(jaccardtest_results, file = paste0("jaccardtest_results_K", k, ".RDS"))
}
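Each RDS file now holds, for a single rank, the split-comparison statistics at every Jaccard length. A hypothetical inspection, assuming the loop above has completed for rank 22:

res <- readRDS("jaccardtest_results_K22.RDS")
names(res)               #"10" "20" ... "100": one entry per Jaccard length
res[["10"]]$jaccard.com  #Number of graph communities at Jaccard length 10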

Plotting Results

library(ggplot2)
library(dplyr)

simdata.dir <- "/<path>/<to>/<jaccardtest_results>/"
seed <- 14  #Seed used for the cNMF runs above; the results directory is organized per seed
sim.df <- NULL

jaccardtest_results <- list()

for(k in ranks){
  splitstats <- readRDS(file = paste0(simdata.dir, seed, "/results/jaccardtest_results_K", k, ".RDS"))
  jaccardtest_results[[k]] <- splitstats
}

jaccard.val <- as.character(jaccard.val)

#for plotting
com.df <- NULL

for(x in ranks){
  u <- NULL
  for(y in jaccard.val){
    u <- c(u, jaccardtest_results[[x]][[y]][["jaccard.com"]]) 
  }
  u <- cbind(u, x)
  u <- data.frame(u, jaccard.val)
  com.df <- rbind(com.df, u)
}
    
colnames(com.df) <- c("Community_Number", "Rank", "Jaccard_Length")

#rank on x axis
com.df$Rank <- as.integer(com.df$Rank)
com.df$Jaccard_Length <- as.character(com.df$Jaccard_Length)
com.df$Jaccard_Length <- factor(com.df$Jaccard_Length, levels = c("10", "20", "30", "40", "50", "60", "70", "80", "90", "100"))


sim_plot <- ggplot(com.df, aes(x=Rank,y=Community_Number, group = Jaccard_Length, color = Jaccard_Length)) +
                stat_smooth(method="loess", span=0.2, se=TRUE, aes(fill=Jaccard_Length), alpha=0.3) +
                theme_linedraw()
sim_plot
## `geom_smooth()` using formula = 'y ~ x'

This block of code finds, for each Jaccard-length curve, the inflection (elbow) point and the slope of the trend beyond it; these are used below to select the best curve.

devtools::install_github("ahasverus/elbow")
library(elbow)

jl.eval <- data.frame()
  
for(x in jaccard.val){
  x <- as.numeric(x)  
  jl.df <- com.df %>% filter(Jaccard_Length == x)
  #Run a loess regression on JL data to smooth the curve
  sl <- jl.df %>% ggplot(aes(Rank, Community_Number)) +
    geom_point() +
    geom_smooth(method = "loess", span = 0.3, method.args = list(degree = 1))
  gbuild <- ggplot_build(sl)
  elbow.df <- data.frame(X = gbuild$data[[2]]$x, Y = gbuild$data[[2]]$y)
  #Find inflection point
  ipoint <- elbow(data = elbow.df, plot = F)
  #Since the elbow was computed on the smoothed curve, retrieve the rank closest to the inflection point found by the elbow package
  optimum.rank <- ranks[which.min(abs(ranks - ipoint$X_selected))]
  #Fit a linear regression to the benefit values beyond the inflection point and record the slope
  ipoint.filtered <- ipoint$data %>% filter(X >= ipoint$X_selected)
  reg <- lm(benefits ~ X, data = ipoint.filtered)
  jl.eval <- rbind(jl.eval, c(x, optimum.rank, reg$coefficients[2]))
}

The following plots were generated when the Jaccard length was set to 10.

sl
## `geom_smooth()` using formula 'y ~ x'

elbow(data=elbow.df)

## $X_selected
## [1] 22.05063
## 
## $data
##             X         Y constant  benefits
## 1    2.000000  1.098521    1.099  1.098521
## 2    4.506329  3.604137    1.247  3.455521
## 3    7.012658  6.032960    1.396  5.735521
## 4    9.518987  8.400256    1.545  7.953521
## 5   12.025316 10.597855    1.693 10.003521
## 6   14.531646 12.164626    1.842 11.421521
## 7   17.037975 13.074135    1.991 12.181521
## 8   19.544304 13.556808    2.140 12.515521
## 9   22.050633 13.707028    2.288 12.517521
## 10  24.556962 13.639138    2.437 12.300521
## 11  27.063291 13.446832    2.586 11.959521
## 12  29.569620 13.192802    2.735 11.556521
## 13  32.075949 13.033196    2.883 11.248521
## 14  34.582278 12.981633    3.032 11.048521
## 15  37.088608 12.988900    3.181 10.906521
## 16  39.594937 12.990984    3.330 10.759521
## 17  42.101266 12.867255    3.478 10.487521
## 18  44.607595 12.675900    3.627 10.147521
## 19  47.113924 12.443428    3.776  9.765521
## 20  49.620253 12.287690    3.924  9.462521
## 21  52.126582 12.354306    4.073  9.379521
## 22  54.632911 12.503159    4.222  9.379521
## 23  57.139241 12.636458    4.371  9.363521
## 24  59.645570 12.734758    4.519  9.314521
## 25  62.151899 12.680946    4.668  9.111521
## 26  64.658228 12.534118    4.817  8.815521
## 27  67.164557 12.367695    4.966  8.500521
## 28  69.670886 12.255096    5.114  8.239521
## 29  72.177215 12.211918    5.263  8.047521
## 30  74.683544 12.174566    5.412  7.861521
## 31  77.189873 12.141634    5.561  7.679521
## 32  79.696203 12.112824    5.709  7.502521
## 33  82.202532 12.087840    5.858  7.328521
## 34  84.708861 12.066385    6.007  7.157521
## 35  87.215190 12.048160    6.155  6.991521
## 36  89.721519 12.032869    6.304  6.827521
## 37  92.227848 12.020214    6.453  6.665521
## 38  94.734177 12.009898    6.602  6.506521
## 39  97.240506 12.001625    6.750  6.350521
## 40  99.746835 11.995096    6.899  6.194521
## 41 102.253165 11.990015    7.048  6.040521
## 42 104.759494 11.986084    7.197  5.887521
## 43 107.265823 11.983005    7.345  5.736521
## 44 109.772152 11.980483    7.494  5.584521
## 45 112.278481 11.979925    7.643  5.435521
## 46 114.784810 11.983365    7.791  5.290521
## 47 117.291139 11.990402    7.940  5.148521
## 48 119.797468 12.000619    8.089  5.010521
## 49 122.303797 12.013596    8.238  4.874521
## 50 124.810127 12.028916    8.386  4.741521
## 51 127.316456 12.046161    8.535  4.609521
## 52 129.822785 12.064911    8.684  4.479521
## 53 132.329114 12.084750    8.833  4.350521
## 54 134.835443 12.105259    8.981  4.222521
## 55 137.341772 12.126019    9.130  4.094521
## 56 139.848101 12.146613    9.279  3.966521
## 57 142.354430 12.168721    9.428  3.839521
## 58 144.860759 12.194274    9.576  3.716521
## 59 147.367089 12.222381    9.725  3.595521
## 60 149.873418 12.252145    9.874  3.476521
## 61 152.379747 12.282669   10.022  3.359521
## 62 154.886076 12.313052   10.171  3.240521
## 63 157.392405 12.342399   10.320  3.120521
## 64 159.898734 12.369809   10.469  2.999521
## 65 162.405063 12.396275   10.617  2.877521
## 66 164.911392 12.423624   10.766  2.756521
## 67 167.417722 12.451767   10.915  2.635521
## 68 169.924051 12.480615   11.064  2.515521
## 69 172.430380 12.510075   11.212  2.396521
## 70 174.936709 12.540056   11.361  2.277521
## 71 177.443038 12.570466   11.510  2.158521
## 72 179.949367 12.601215   11.659  2.040521
## 73 182.455696 12.632210   11.807  1.923521
## 74 184.962025 12.663361   11.956  1.805521
## 75 187.468354 12.694575   12.105  1.688521
## 76 189.974684 12.725762   12.253  1.571521
## 77 192.481013 12.756830   12.402  1.453521
## 78 194.987342 12.787687   12.551  1.335521
## 79 197.493671 12.818243   12.700  1.216521
## 80 200.000000 12.848406   12.848  1.098521

This programmatically determines the optimal parameters (rank and Jaccard length) for the dataset.

colnames(jl.eval) <- c("JL", "Rank", "Slope")
#Determine which JL curve gives the steepest (most negative) slope for the cost values
max.slope.params <- jl.eval %>% filter(Slope == min(Slope))
comnum <- com.df %>% filter(Jaccard_Length == max.slope.params$JL & Rank == max.slope.params$Rank)
params <- as.data.frame(cbind(comnum$Community_Number, max.slope.params$Rank, max.slope.params$JL))
colnames(params) <- c("Community_Number", "Rank", "Jaccard_Length")
  
optimized.community.params <- list()
com.labels <- list()
optimized.community.params[[<dataset.name>]] <- params
  
#Get the significant matches (indices) from the Jaccard test
  
sig.labels <- as.data.frame(jaccardtest_results[[params$Rank]][[as.character(params$Jaccard_Length)]]$sigFwerJaccard)
sig.labels$row <- paste0("Split0_GEP", sig.labels$row)
sig.labels$col <- paste0("Split1_GEP", sig.labels$col)
com.labels[[<dataset.name>]] <- sig.labels

saveRDS(optimized.community.params, file = "optimized_community_parameters.RDS")
saveRDS(com.labels, file = "community_labels.RDS")

Optimal parameters

params
##   Community_Number Rank Jaccard_Length
## 1               14   22             10

Significant pairs of replicated GEPs across the splits

sig.labels
##             row          col
## 1  Split0_GEP10  Split1_GEP2
## 2  Split0_GEP13  Split1_GEP3
## 3   Split0_GEP2  Split1_GEP4
## 4  Split0_GEP14  Split1_GEP6
## 5  Split0_GEP15  Split1_GEP7
## 6   Split0_GEP4  Split1_GEP8
## 7  Split0_GEP21  Split1_GEP9
## 8  Split0_GEP16 Split1_GEP10
## 9   Split0_GEP8 Split1_GEP11
## 10  Split0_GEP1 Split1_GEP12
## 11 Split0_GEP12 Split1_GEP14
## 12  Split0_GEP5 Split1_GEP15
## 13  Split0_GEP6 Split1_GEP17
## 14  Split0_GEP9 Split1_GEP21