Towards modeling context-specific EMT regulatory networks using temporal single cell RNA-Seq data

Data preprocessing

We downloaded the rds files from the google drive link provided in the Cook et al. manuscript. The rds files contain the preprocessed data which we extracted and used for subsequent analysis. We separated the data from each treatment into two groups - forward transition (8 hour, Day 0,1,3,7) or reverse transition (Day 7, after removal - 8 hour, Day 1, 3). The data from day 7 were included in both datasets. We wrote the data to .tsv files and used those as the input file in SCENIC.

# Prepare data for processing

library("Seurat")
library(Seurat)
library(stringr)
library(Seurat)
library(jsonlite)
library(sjmisc)
library(MASS)
library(sRACIPE)
library(UpSetR)
library(ggplot2)

dir <- "/EmtRds" # Directory where the downloaded rds files are stored.
files <- list.files(dir)

for(i in seq_along(files)){
  filename <- paste0(dir,files[i])
  data <- readRDS(filename)
  fwd <- subset(data, Time %in% c("0d", "1d", "3d", "7d", "8h"))
  tmp <- as.matrix(Seurat::GetAssayData(fwd[["RNA"]], slot = "data"))
  write.table(tmp,row.names = TRUE,col.names = TRUE, sep = "\t", file = paste0(dir,"expression/",tools::file_path_sans_ext(files[i]),"_fwd.tsv"),quote = FALSE)
  rm(tmp,fwd)
  rev <- subset(data, Time %in% c("8h_rm", "1d_rm","3d_rm", "7d"))
  tmp <- as.matrix(Seurat::GetAssayData(rev[["RNA"]], slot = "data"))
  write.table(tmp,row.names = TRUE,col.names = TRUE, sep = "\t", file = paste0(dir,"expression/",tools::file_path_sans_ext(files[i]),"_rev.tsv"),quote = FALSE)
  
}

Running pySCENIC

We downloaded the “hg19 cis target databases” and “motif to tf annotation” from cisTarget databaes and used those in pySCENIC v0.9.19 alongwith the .tsv files generated above. We followed the pySCENIC full pipeline code for running pySCENIC. Our specific python script is given below.

if __name__ == '__main__':
import os
import glob
import pickle
import pandas as pd
import numpy as np
import json
import seaborn as sns
import sys

from dask.diagnostics import ProgressBar
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell



inputFilename = sys.argv[1] # Name of the specific data file provided as input through the shell script file
RESULT_FOLDER = "../results"
DATA_FOLDER = "../expression"
RESOURCES_FOLDER = "../resources"
DATABASE_FOLDER = "../databases/"
SCHEDULER = "123.122.8.24:8786"
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg19*.mc9nr.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(DATABASE_FOLDER, "motifs-v9-nr.hgnc-m0.001-o0.0.tbl")
MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'hs_hgnc_curated_tfs.txt')
SC_EXP_FNAME = os.path.join(DATA_FOLDER, (inputFilename + ".tsv"))
MODULES_FNAME = os.path.join(RESULT_FOLDER, ("modules_" + inputFilename + ".p"))
REGULONS_FNAME = os.path.join(RESULT_FOLDER, "regulons_" + inputFilename + ".p")
MOTIFS_FNAME = os.path.join(RESULT_FOLDER, "motifs_" + inputFilename + ".csv")

ex_matrix = pd.read_csv(SC_EXP_FNAME, sep='\t', header=0, index_col=0).T
print(ex_matrix.shape)
tf_names = load_tf_names(MM_TFS_FNAME)
print("tf names loaded")
out_file = os.path.join(RESULT_FOLDER, "grn_output_" + inputFilename + ".tsv")
db_fnames = glob.glob(DATABASES_GLOB)


def name(fname):
return os.path.splitext(os.path.basename(fname))[0]


dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
print(dbs)
print("running grnboost")
adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
adjacencies.head()
print("identify modules")
adjacencies.to_csv(out_file, sep='\t', index=False, header=False)
print("grnboost done")
# adjacencies = pd.read_csv(out_file, sep='\t', names=["TF", "target", "importance"])
modules = list(modules_from_adjacencies(adjacencies, ex_matrix, rho_mask_dropouts=True))

print("writing modules")
with open(MODULES_FNAME, 'wb') as f:
pickle.dump(modules, f)
# print("reading modules")
# with open(MODULES_FNAME, 'rb') as f:
#     modules = pickle.load(f)

print("Finding Enriched modules")
# Calculate a list of enriched motifs and the corresponding target genes for all modules.

with ProgressBar():
df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
df.head()

# Create regulons from this table of enriched motifs.
print("creating regulons")
regulons = df2regulons(df)

print("writing regulons")
# Save the enriched motifs and the discovered regulons to disk.
df.to_csv(MOTIFS_FNAME)
with open(REGULONS_FNAME, "wb") as f:
pickle.dump(regulons, f)

print("Finding AUC of cells")
auc_mtx = aucell(ex_matrix, regulons, num_workers=1)
auc_file = os.path.join(RESULT_FOLDER, "AUC_" + inputFilename + ".csv")
auc_mtx.to_csv(auc_file)

# Write to JSON for import to R
rdict = {} 
for reg in regulons:
# print(reg)
targets = [ target for target in reg.gene2weight ]
rdict[reg.name] = targets


rjson = json.dumps(rdict)
jsonFile = os.path.join(RESULT_FOLDER, "regulons_" + inputFilename + ".json")
f = open(jsonFile,"w")
f.write(rjson)
f.close()

# Plot the results for visualization
sns_plot = sns.clustermap(auc_mtx, figsize=(8,8))
plotFile = os.path.join(RESULT_FOLDER, "AUC_" + inputFilename + ".png")
sns_plot.savefig(plotFile)

Processing SCENIC regulons and activities

The pySCENIC pipeline generated regulons and the TF activities which were used for subsequent analysis in R. First, we processed the calculated activities using Seurat and created Seurat objects containing the activities.

options(stringsAsFactors = FALSE)
## Network Construction
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
direction <- c("fwd", "rev")

for(c in 1:length(cancer_types)) {
  ## Read in seurat object and regulon data
  #  c <- 2
  data_dir <- "EmtRds/" 
  seurat <- readRDS(paste0(data_dir,cancer_types[c],"_",t,".rds"))
  
  auc_fname <- paste0(data_dir,"/SCENIC/results/AUC_",cancer_types[c],"_",t,"_fwd.csv")
  auc_matrix <- read.table(auc_fname, header=TRUE, sep = ",", row.names = 1, check.names = FALSE)
  auc_matrix <- t(auc_matrix)
  seuratFwd <- subset(seurat, Time %in% c("0d", "1d", "3d", "7d", "8h"))
  auc_matrix <- auc_matrix[,colnames(seuratFwd)]
  seurat_regs <- CreateSeuratObject(counts = auc_matrix)
  seurat_regs <- AddMetaData(seurat_regs, metadata = FetchData(seurat, "Time"))
  Idents(seurat_regs) <- "Time"
  allGenes <- rownames(seurat_regs)
  
  seurat_regs <- ScaleData(seurat_regs, features = allGenes)
  seurat_regs <- FindVariableFeatures(seurat_regs)
  
  seurat_regs <- RunPCA(seurat_regs)
  seurat_regs <- RunUMAP(seurat_regs, dims = 1:50)
  ## Save Rds files
  saveRDS(seurat_regs, file = paste0(output_dir,"seurat_regs",cancer_types[c],"_",t,"_fwd.Rds"))
  
  auc_fname <- paste0(data_dir,"/SCENIC/results/AUC_",cancer_types[c],"_",t,"_rev.CSV")
  auc_matrix <- read.table(auc_fname, header=TRUE, sep = ",", row.names = 1, check.names = FALSE)
  auc_matrix <- t(auc_matrix)
  seuratRev <- subset(seurat, Time %in% c("1d_rm","3d_rm",  "7d", "8h_rm" ))
  auc_matrix <- auc_matrix[,colnames(seuratRev)]
  
  seurat_regs <- CreateSeuratObject(counts = auc_matrix)
  seurat_regs <- AddMetaData(seurat_regs, metadata = FetchData(seurat, "Time"))
  Idents(seurat_regs) <- "Time"
  allGenes <- rownames(seurat_regs)
  
  seurat_regs <- ScaleData(seurat_regs, features = allGenes)
  seurat_regs <- FindVariableFeatures(seurat_regs)
  
  seurat_regs <- RunPCA(seurat_regs)
  seurat_regs <- RunUMAP(seurat_regs, dims = 1:50)
  saveRDS(seurat_regs, file = paste0(output_dir,"seurat_regs",cancer_types[c],"_",t,"_rev.Rds"))
}

Differential Activity analysis

Next, we identified the differentially activated transcription factors (DATFs) between 0d and other time points in the forward transition and 7 days and other time points in the reverse direction.

options(stringsAsFactors = FALSE)
## Network Construction
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
direction <- c("fwd", "rev")

counter <- 1
degs <- data.frame()
for(tCounter in seq_along(treatments)){
  for(cCounter in seq_along(cancer_types)){
    for(dCounter in seq_along(direction)){
      
      ## Select a treatment to work with (or iterate thru t in a loop)
      t <- treatments[tCounter]
      c <- cancer_types[cCounter]
      direc <- direction[dCounter]
      treatmentName <- paste0(c,"_",t,"_",direc)
      print(treatmentName)
      output_dir <- paste0("output/auc/")
      
      
      seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",direc,".Rds"))
      
      if(direc == "fwd"){
        for(degTime in c("8h", "1d", "3d", "7d")){
          deg <- FindMarkers(seurat_regs, ident.1 = degTime, ident.2 = "0d", logfc.threshold = 0.0)
          deg$CellLine <- c
          deg$gene <- rownames(deg)
          deg$Comparison <- paste0(c,"_",t,"_",direc)
          deg$Time <-  degTime
          degs <- rbind(degs,deg)
        }
      }
      if(direc == "rev"){
        for(degTime in c("8h_rm", "1d_rm", "3d_rm")){
          deg <- FindMarkers(seurat_regs, ident.1 = degTime, ident.2 = "7d", logfc.threshold = 0.0)
          deg$CellLine <- c
          deg$gene <- rownames(deg)
          deg$Comparison <- paste0(c,"_",t,"_",direc)
          deg$Time <-  degTime
          degs <- rbind(degs,deg)
        }
      }
    }
  }
}
saveRDS(degs, file = "degRegsFwd0dRev7d.rds")

Identifying significant DATFs and generating UpSet plot

Next, we looked at the overlap between the DATFs across various treatments. We used the top 100 DATFs from each comparison.

library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(MASS)
library(sRACIPE)
library(UpSetR)
library(ggplot2)

options(stringsAsFactors = FALSE)
## NEtwork Construction
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
direction <- c("fwd", "rev")

counter <- 1
treatmentNames <- c()
for(tCounter in seq_along(treatments)){
  for(cCounter in seq_along(cancer_types)){
    t <- treatments[tCounter]
    c <- cancer_types[cCounter]
    treatmentNames <- c(treatmentNames,paste0(c,"_",t))
    
  }
}

degs <- readRDS(file = "output/network_construction/degRegsFwd0dRev7d.rds")
degs$Signal <- str_match(degs$Comparison, "_(.*?)_")[,2]


diffRegulonList <- list()
nDiffRegulon <- 100
set.seed(123)
for(tCounter in seq_along(treatments)){
  for(cCounter in seq_along(cancer_types)){
    for(dCounter in seq_along(direction)){
      t <- treatments[tCounter]
      c <- cancer_types[cCounter]
      direc <- direction[dCounter]
      treatmentName <- paste0(c,"_",t,"_",direc)
      print(treatmentName)
      output_dir <- paste0("output/auc/")
      
      deg_reg <- degs[degs$Comparison == paste0(c,"_",t,"_",direc),] 
      seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",direc,".Rds"))
      regulon_fname <- paste0("EmtRds/SCENIC/results/regulons_",c,"_",t,"_",direc,".json")
      regs <- read_json(regulon_fname, simplify = T)
      
      networkReg <- deg_reg$gene[rank(deg_reg$p_val_adj, ties.method = "random")]
      networkReg <- unique(networkReg)
      networkReg <- networkReg[1:min(nDiffRegulon,length(networkReg))]
      regs <- regs[networkReg]
      diffRegulonList[[treatmentName]]  <-  regs
    }
  }
}
saveRDS(diffRegulonList, file = "diffRegulonTop100Deg0d7d.rds")

diffRegulonNames <- lapply(diffRegulonList, function(x) names(x))
diffRegulonFwdRev <- list()
for(i in 1:(length(diffRegulonList)/2)){
  diffRegulonFwdRev[[i]] <- union(diffRegulonNames[[2*i-1]],diffRegulonNames[[2*i]])
}

names(diffRegulonFwdRev) <- treatmentNames
saveRDS(diffRegulonFwdRev, file = "diffRegulonFwdRev.rds")

diffRegulonFwdRev = diffRegulonFwdRev[order(names(diffRegulonFwdRev))]
upset(fromList(diffRegulonFwdRev),keep.order = F, group.by = "degree", text.scale = 2, nsets = length(diffRegulonFwdRev))

The DATFs were used to identify core TFs which occur most frequently.

library(dplyr)
degs <- readRDS(file = "degRegsFwd0dRev7d.rds")
degsFiltered <- degs[degs$p_val_adj < 0.05,]
TfFrequency <- degsFiltered %>%
  group_by(gene) %>%
  summarize(n())
hist(TfFrequency$`n()`, breaks = 20)
coreTf <- TfFrequency$gene[TfFrequency$`n()` > 23]
saveRDS(coreTf, file = "coreTf.rds")

Visualizing regulon activity across timepoints

To understand the phenotypic distribution of states in terms of regulon activity, we performed UMAP dimensional reduction on all conditions using activity data from all identified regulons. In order to have comparable values for the fwd timepoints (0d-7d after signal induction) and the rev timepoints (7d-3d_rm, denoting 3 days after signal interruption), we scaled the regulon activity for rev timepoints using a linear model such that the 7d data aligned for both directions. We then visualized the UMAP values on a density map with the centroids for each timepoint indicated by a black point.

### Setup
rm(list = ls())
library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(varhandle)
library(sRACIPE)
library(plyr)
library(ggplot2)
library(ggpmisc)
library(rowr)
library(spatstat.utils)
library(umap)
library(ggthemes)

## Initialize some values
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
directions <- c("fwd","rev")
output_dir <- "output/cook/network_construction/dec1/"
data_dir <- "data/cook/"
scenic_data_dir <- "data/cook/SCENIC/results/"
network_dir <- "networks/vivekCode/cellLineNetworks/"
plotFName <- "figs/SI/revision/UMAP_density_alpha.pdf"
working_directory <- getwd()
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#E84855")

## Color setup for plots
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
plot_color <- rf(20)

## Make plot list
umap_density_list <- list()
iter <- 1

## Loop thru cell lines, treatments, directions
for(cancer in seq_along(cancer_types)) {
  c <- cancer_types[cancer]
  for(treatment in seq_along(treatments)) {
    t <- treatments[treatment]
    shared_regs <- c()
    print(paste0("Starting work on ",c,"_",t))
    for(dir in seq_along(directions)) {
      d <- directions[dir]
      if(d == "fwd") {
        seurat_regs_fwd <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
        auc_mtx_fwd <- as.data.frame(t(GetAssayData(seurat_regs_fwd, slot = "scale.data")))
        auc_mtx_fwd$Time <- FetchData(seurat_regs_fwd, "Time")[,"Time"]
        auc_mtx_fwd$Direction <- d
        auc_mtx_fwd$Cell <- rownames(auc_mtx_fwd)
        
      } else {
        seurat_regs_rev <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
        auc_mtx_rev <- as.data.frame(t(GetAssayData(seurat_regs_rev, slot = "scale.data")))
        auc_mtx_rev$Time <- FetchData(seurat_regs_rev, "Time")[,"Time"]
        auc_mtx_rev$Direction <- d
        auc_mtx_rev$Cell <- rownames(auc_mtx_rev)
        
      }
    }
    
    ## Scale using 7d data
    shared_regs <- colnames(auc_mtx_fwd)[which(colnames(auc_mtx_fwd) %in% colnames(auc_mtx_rev))]
    shared_regs_short <- shared_regs[-((length(shared_regs)-2):length(shared_regs))]
    auc_mtx_fwd_scale <- auc_mtx_fwd[,shared_regs]
    auc_mtx_rev_scale <- auc_mtx_rev[,shared_regs]
    for(reg in shared_regs_short) {
      fwd_7d_reg <- auc_mtx_fwd_scale[which(auc_mtx_fwd_scale$Direction == "fwd" & 
                                              auc_mtx_fwd_scale$Time == "7d"),reg]
      rev_7d_reg <- auc_mtx_rev_scale[which(auc_mtx_rev_scale$Direction == "rev"& 
                                              auc_mtx_rev_scale$Time == "7d"),reg]
      ## Generate linear models
      x_lm <- lm(fwd_7d_reg ~ rev_7d_reg)
      new_rev_reg <- (auc_mtx_rev_scale[which(auc_mtx_rev_scale$Direction == "rev" ),reg] * 
                      unname(x_lm$coefficients["rev_7d_reg"])) + 
        unname(x_lm$coefficients["(Intercept)"])
      
      ## Replace reverse data
      auc_mtx_rev_scale[,reg] <- new_rev_reg
      
    }
    
    ## Combine fwd and rev data
    auc_mtx_fwd_scale <- auc_mtx_fwd_scale[which(!auc_mtx_fwd_scale$Time == "7d"),]
    combined_mtx <- rbind(auc_mtx_fwd_scale, auc_mtx_rev_scale)
    na_regs <- colnames(combined_mtx[,colSums(is.na(combined_mtx)) != 0])
    combined_mtx_noNA <- combined_mtx[ , colSums(is.na(combined_mtx)) == 0]
    pca_regs <- shared_regs_short[which(shared_regs_short %in% colnames(combined_mtx_noNA))]
    combined_cellInfo <- combined_mtx[,c("Time","Direction",'Cell')]
    
    ## Progress report
    print(paste0("Finished scaling data for ",c,"_",t,". Starting dimensional reduction."))
    
    ## Run UMAP
    umap_rs <- umap(combined_mtx_noNA[,pca_regs])
    umap_data <- as.data.frame(umap_rs$layout)
    colnames(umap_data) <- c("x","y")
    if(all.equal(combined_cellInfo$Cell, rownames(umap_data))) {
      umap_data$Time <- combined_cellInfo$Time
    }
    umap_data$Time <- factor(umap_data$Time, levels = c("0d", "8h", "1d", "3d", "7d", "8h_rm", "1d_rm", "3d_rm"))
    
    ## Progress report
    print(paste0("Done with dimensional reductions for ",c,"_",t,". Creating plots now."))
    
    ## Calculate centroids for each timepoint
    centroid_df_umap <- aggregate(umap_data[,c("x","y")], list(Time=umap_data$Time), mean)
    
    ## Plot UMAP density with timepoint data
    ggscatter <- heatscatter(umap_data[,1], umap_data[,2], 
                xlab = "UMAP1", 
                ylab= "UMAP2", 
                ggplot = TRUE)
    image <- ggscatter + 
      geom_point(data=centroid_df_umap, mapping = aes(x=x, y=y, size=Time, alpha=Time)) +
      geom_path(data = centroid_df_umap, arrow = arrow(), linetype="dashed") +
      guides(alpha=FALSE, size=FALSE, color=FALSE, 
             shape=FALSE) +
      scale_shape_manual(values = c(15,16,17,18,8,21,24,23)) +
      scale_alpha_discrete(breaks=c(0.4,0.485,0.57,0.655,0.74,0.825,0.91,1)) +
      ggtitle(paste(c," ",t)) +
      theme(title = element_text(size=20),
            axis.title = element_text(size = 20),
            axis.text = element_text(size=18),
            plot.subtitle = element_text(size=16),
            legend.title = element_text(size=20),
            legend.title.align = 0,
            legend.text = element_text(size=18),
            legend.key.size = unit(20, units = "pt")) +
      labs(subtitle = paste0("UMAP on ",length(pca_regs), " regulons in ", nrow(umap_data), " cells"))
    if(t == "TNF") {
      image <- image + 
        guides(size=guide_legend(title = "Time"), 
               color=guide_legend(override.aes = list(size = 3.5)), 
               shape=guide_legend(override.aes = list(size = 3.5)))
    }
    umap_density_list[[iter]] <- image

    
    ## Increase iterator
    iter <- iter+1
  }
}

## Save all plots
pdf(file = plotFName, width=11.5, height=8)
for(plot in umap_density_list) {
  print(plot, newpage = TRUE)
}
graphics.off()

To understand how different regulons participate in different phases of the transition, we visualized the activity of 28 highly conserved regulons over time in each experimental condition.

### Setup
rm(list = ls())
library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(varhandle)
library(sRACIPE)
library(plyr)
library(ggplot2)
library(ggpmisc)
library(rowr)
library(spatstat.utils)
library(umap)
library(ggthemes)

## ggplot theme for graphics
ggtheme <- function (base_size = 11, base_family = "") {
  theme_get() %+replace%
    theme(axis.title = element_text(size = 16),
          axis.text = element_text(size=14),
          axis.line = element_line(color = "black", linetype = "solid"),
          title = element_text(size = 16),
          legend.title = element_text(size=16),
          legend.title.align = 0,
          legend.text = element_text(size=14),
          legend.key.size = unit(20, units = "pt"),
          panel.grid = element_blank(),
          panel.background = element_blank())
}

# Multiple plot function from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

## Initialize some values
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
directions <- c("fwd","rev")
output_dir <- "output/cook/network_construction/dec1/"
data_dir <- "data/cook/"
scenic_data_dir <- "data/cook/SCENIC/results/"
network_dir <- "networks/vivekCode/cellLineNetworks/"
working_directory <- getwd()
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#E84855")


## Get reg list
reg_list <- c("BHLHE40(+)", "CEBPB(+)", "CREB3(+)", "CREM(+)", "ELK3(+)",
              'ESRRA(+)', 'ETS1(+)', 'ETS2(+)', 'FOS(+)', 'FOSB(+)',
              'FOSL1(+)', 'IRF1(+)', 'IRF2(+)', 'IRF3(+)', 'IRF7(+)', 
              'JUN(+)', 'JUNB(+)', 'KLF6(+)', 'MAFG(+)', 'MYC(+)', 
              'NFE2L3(+)', 'NFKB2(+)', 'RELB(+)', 'SOX4(+)', 'SPDEF(+)',
              'STAT1(+)', 'STAT3(+)', 'XBP1(+)')
gene_list <- reg_list

## make dummy plot for null cases
dummy_df <- data.frame(x=1,y=1)
dummy_plot <- ggplot(data = dummy_df) +
  geom_point(aes(x=x, y=y, alpha=0.0001)) +
  guides(alpha=FALSE) +
  ggtheme() +
  geom_text(mapping = aes(x=1, y=1, label="NULL"))

for(g in seq_along(gene_list)) {
  gene_plotlist <- list()
  iter <- 1
  for(c in cancer_types) {
    for(t in treatments) {
      gene_df_full <- data.frame(Activity=numeric(), Time=character(),Direction=character())
      for(d in directions) {
        ## Read auc matrix, seurat object, de regs
        treatmentName <- paste0(c,"_",t,"_",d)
        seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
        auc_mtx <- GetAssayData(seurat_regs, slot = "scale.data")
        
        ## Select gene of interest
        if(gene_list[g] %in% rownames(auc_mtx)) {
          gene_df <- as.data.frame(auc_mtx[gene_list[g],])
          colnames(gene_df)[1] <- "Activity"
          gene_df$Time <- FetchData(seurat_regs, "Time")[,"Time"]
          gene_df$Direction <- d
          gene_df_full <- rbind(gene_df_full, gene_df)
        }
      }
      ## Only proceed if there is data for the regulon - also correct color scheme if data for one direction is missing
      if(nrow(gene_df_full) > 0) {        
        gene_df_full$Time <- factor(gene_df_full$Time, levels = c("0d", "8h", "1d", "3d", "7d", "8h_rm", "1d_rm", "3d_rm"))
        
        if(length(unique(gene_df_full$Direction)) == 1) {
          if(unique(gene_df_full$Direction) == "fwd") {
            plotColors <- cbPalette
          } else {
            plotColors <- cbPalette[5:8]
          }
        } else {
          plotColors <- cbPalette
        }
        
        ## Plot
        image <- ggplot(gene_df_full, aes(x=Time,y=Activity, fill=Time)) +
          geom_boxplot() +
          ggtheme() +
          ylab(paste0(gene_list[g])) +
          scale_fill_manual(values=plotColors) +
          guides(fill=FALSE) +
          facet_wrap(~Direction, scales = "free") +
          theme(axis.text.y = element_text(size = 14), 
                title = element_text(size = 20), 
                axis.title.y = element_text(size = 16),
                legend.text = element_text(size = 12),
                legend.title = element_text(size = 14),
                strip.text = element_text(size = 16),
                axis.text.x = element_blank(),
                axis.title.x = element_blank()) +
          ggtitle(paste0(c," ", t))
        
        if(t == "TNF") {
          image <- image +
            theme(axis.text.x = element_text(size = 16, angle = 90), 
                  axis.title.x = element_text(size=20))
        }
        gene_plotlist[[iter]] <- image
        iter <- iter+1
      } 
      else {
        gene_plotlist[[iter]] <- dummy_plot
        iter <- iter+1
        
      }
    }
  }
  
  ## Save plot for this regulon
  title <- paste0("figs/fig3/",gene_list[g],"_time_activity.pdf")
  pdf(file = title, width = 22, height = 17)
  multiplot(plotlist = gene_plotlist, cols = 4)
  dev.off()

}

Using three specific combinations of regulons known to be highly involved in the propagation of the EMT-inducing signals, specifically 1) RELB+NFKB2 vs FOSL1+JUNB, 2) FOSL1 vs RELB, and 3) JUNB vs RELB, we then visualized the average activity of these TFs over time for each condition. As in the earlier UMAP density plots, the regulon activity data were scaled on a linear model such that the

### Setup
rm(list = ls())
library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(varhandle)
library(sRACIPE)
library(plyr)
library(ggplot2)
library(ggpmisc)
library(rowr)
library(spatstat.utils)
library(umap)
library(ggthemes)
library(RColorBrewer)

## ggplot theme for graphics
ggtheme <- function (base_size = 11, base_family = "") {
  theme_get() %+replace%
    theme(axis.title = element_text(size = 18),
          axis.text = element_text(size=16),
          axis.line = element_line(color = "black", linetype = "solid"),
          title = element_text(size = 20),
          legend.title = element_text(size=18),
          legend.title.align = 0,
          legend.text = element_text(size=16),
          legend.key.size = unit(20, units = "pt"),
          panel.grid = element_blank(),
          panel.background = element_blank())
}

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}


## Initialize some values
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
directions <- c("fwd","rev")
output_dir <- "output/cook/network_construction/dec1/"
data_dir <- "data/cook/"
scenic_data_dir <- "data/cook/SCENIC/results/"
network_dir <- "networks/vivekCode/cellLineNetworks/"
working_directory <- getwd()
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#E84855")

## Parameters
genepair_lists <- list(c("RELB+NFKB2","FOSL1+JUNB"),c("FOSL1","RELB"),c("JUNB","RELB"))

## make dummy plot for null cases
dummy_df <- data.frame(x=1,y=1)
dummy_plot <- ggplot(data = dummy_df) +
  geom_point(aes(x=x, y=y, alpha=0.0001)) +
  guides(alpha=FALSE) +
  ggtheme() +
  geom_text(mapping = aes(x=1, y=1, label="NULL"))

for(genes in genepair_lists) {
  trans_centroid_plotlist <- list()
  iter <- 1
  
  g1_expanded <- strsplit(genes[1],"\\+")[[1]]
  g2_expanded <- strsplit(genes[2],"\\+")[[1]]
  full_genelist <- c(g1_expanded, g2_expanded)
  
  regs_g1 <-paste0(g1_expanded, "(+)") 
  regs_g2 <-paste0(g2_expanded, "(+)") 
  regs <- paste0(full_genelist, "(+)")
  
  ## Loop thru cancer types and treatments (12 conditions)
  for(c in cancer_types) {
    for(t in treatments) {
      activity_df_full <- data.frame(x=numeric(),y=numeric(),Time=character(),Direction=character())
      # Combine data for fwd and rev
      for(d in directions) {
        seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
        if(length(regs[which(regs %in% rownames(seurat_regs))]) == length(regs)) {
          auc_mtx <- as.data.frame(t(GetAssayData(seurat_regs, slot = "scale.data")[regs,]))
          if(length(regs) == 2) {
            colnames(auc_mtx) <- c("x","y")
            auc_mtx$Time <- FetchData(seurat_regs, "Time")[,"Time"]
            auc_mtx$Direction <- d
          } 
          else if(length(regs) == 4){
            auc_mtx$x <- rowSums(auc_mtx[,c(regs_g1)], dims = 1)
            auc_mtx$y <- rowSums(auc_mtx[,c(regs_g2)], dims = 1)
            auc_mtx <- auc_mtx[,c("x","y")]
            auc_mtx$Time <- FetchData(seurat_regs, "Time")[,"Time"]
            auc_mtx$Direction <- d
          } 
          else {
            auc_mtx <- data.frame()
          }
          
          
        } else if(c == "MCF7" & t == "EGF") {
          ## Handle an exception case where data is missing for one regulon
          auc_mtx <- as.data.frame(t(GetAssayData(seurat_regs, slot = "scale.data")[c("RELB(+)",regs_g2),]))
          auc_mtx$x <- auc_mtx$`RELB(+)`
          auc_mtx$y <- rowSums(auc_mtx[,c(regs_g2)], dims = 1)
          auc_mtx <- auc_mtx[,c("x","y")]
          auc_mtx$Time <- FetchData(seurat_regs, "Time")[,"Time"]
          auc_mtx$Direction <- d
        }
        else {
          auc_mtx <- data.frame()
        }
        activity_df_full <- rbind(activity_df_full, auc_mtx)
        
      }
      ## If data for both directions is present, scale rev data to match fwd
      if(nrow(activity_df_full) > 0 & length(unique(activity_df_full$Direction)) == 2) {  
        ## Set colors 
        activity_df_full$Time <- factor(activity_df_full$Time, levels = c("0d", "8h", "1d", "3d", "7d", "8h_rm", "1d_rm", "3d_rm"))
        plotColors <- cbPalette
        
        
        #### Scaled plots
        if(length(unique(activity_df_full$Direction)) == 2) {
          fwd_7d_x <- activity_df_full[which(activity_df_full$Direction == "fwd" & activity_df_full$Time == "7d"),"x"]
          fwd_7d_y <- activity_df_full[which(activity_df_full$Direction == "fwd"& activity_df_full$Time == "7d"),"y"]
          
          rev_7d_x <- activity_df_full[which(activity_df_full$Direction == "rev"& activity_df_full$Time == "7d"),"x"]
          rev_7d_y <- activity_df_full[which(activity_df_full$Direction == "rev"& activity_df_full$Time == "7d"),"y"]
          
          ## Generate linear models
          x_lm <- lm(fwd_7d_x ~ rev_7d_x)
          y_lm <- lm(fwd_7d_y ~ rev_7d_y)
          
          ## Rescale rev data
          new_rev_x <- (activity_df_full[which(activity_df_full$Direction == "rev" ),"x"] * 
                          unname(x_lm$coefficients["rev_7d_x"])) + 
            unname(x_lm$coefficients["(Intercept)"])
          new_rev_y <- (activity_df_full[which(activity_df_full$Direction == "rev" ),"y"] * 
                          unname(y_lm$coefficients["rev_7d_y"])) + 
            unname(y_lm$coefficients["(Intercept)"])
          
          ## Add to df
          activity_df_full$transformed_x <- activity_df_full$x
          activity_df_full[which(activity_df_full$Direction == "rev"),"transformed_x"] <- new_rev_x
          
          activity_df_full$transformed_y <- activity_df_full$y
          activity_df_full[which(activity_df_full$Direction == "rev"),"transformed_y"] <- new_rev_y
        }
        
        
        ## Make centroid df
        centroid_df <- aggregate(activity_df_full[,c("transformed_x","transformed_y")], list(Time=activity_df_full$Time), mean)
        
        ## Plot centroids
        rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
        plot_color <- rf(20)
        image <- ggplot(centroid_df, aes(x=transformed_x,y=transformed_y)) +
          geom_point(mapping=aes(size=1,  color=Time)) +
          geom_path(aes(alpha=0.5), arrow = arrow(), linetype="dashed") +
          xlab(genes[1]) +
          ylab(genes[2]) +
          scale_size(range=c(1.75, 3)) +
          scale_shape_manual(values=c(16,17,3,4,8,18,15,13)) +
          guides(alpha=FALSE, size=FALSE, color=FALSE) + 
          scale_color_manual(values=plot_color) +
          ggtheme() +
          ggtitle(paste(c," ",t))
        trans_centroid_plotlist[[iter]] <- image
        iter <- iter+1
        
        
      } 
      else {
        trans_centroid_plotlist[[iter]] <- dummy_plot
        iter <- iter+1
      }
      
      
    }
  }

  title <- paste0(output_dir,genes[1],"_",genes[2],"_activity_centroids_SCALED.pdf")
  pdf(file = title, width = 22, height = 17)
  multiplot(plotlist = trans_centroid_plotlist, cols = 4)
  dev.off()
}

Network construction

We used the DATF regulons to identify Tf-target pairs for all the experimental conditions.

allRegulons <- list()
for(tCounter in seq_along(treatments)){
  for(cCounter in seq_along(cancer_types)){
    for(dCounter in seq_along(direction)){
      
      ## Select a treatment to work with (or iterate thru t in a loop)
      t <- treatments[tCounter]
      c <- cancer_types[cCounter]
      direc <- direction[dCounter]
      treatmentName <- paste0(c,"_",t,"_",direc)
      output_dir <- paste0("output/auc/")
      
      regulon_fname <- paste0("EmtRds/SCENIC/results/regulons_",c,"_",t,"_",direc,".json")
      regs <- read_json(regulon_fname, simplify = T)
      network <- data.frame(Source = character(), Target = character())
      for(src in seq_along(regs)){
        targets <- unlist(regs[src])
        networkTmp <- data.frame(Source = rep(names(regs)[src],length(targets)), Target = targets)
        network <- rbind(network, networkTmp)
      }
      network$Dataset <- treatmentName
      allRegulons[[treatmentName]] <- network
      
    }
  }
}

saveRDS(allRegulons, file = "allRegulons.rds")

Identifying network interactions and their weigths

First we defined a function to calculate mutual information matrix using the activity matrix.

library(infotheo)
#' @param data expression matrix. Genes in rows and samples/cells in columns
#' @param nbins number of bins
#' @value A mutual information matrix
calculateMI <- function(data = actMat){
  data <- t(data)
  nGenes <- dim(data)[2]
  miMat <- matrix(0,nrow = nGenes,ncol = nGenes)
  geneNames <- colnames(data)
  rownames(miMat) <- geneNames
  colnames(miMat) <- geneNames
  data <- infotheo::discretize(data)
  #i=1
  for(i in 1:(nGenes-1))
  {
    for(j in (i+1):(nGenes))
    {
      gene1 <- geneNames[i]
      gene2 <- geneNames[j]
      miMat[gene1, gene2] <- infotheo::mutinformation(data[,gene1], data[,gene2], method = "mm")
      miMat[gene2, gene1] <- miMat[gene1, gene2]
    }
  }
  return(miMat)
}

Next, we identified the networks for each treatment using the regulons from SCENIC and added the mutual information and correlation for each interaction.

allRegulons <- readRDS("allRegulons.rds")
coreTf <- readRDS("coreTf.rds")
fullNetworkList <- list()
counter <- 1
for(tCounter in seq_along(treatments)){
  for(cCounter in seq_along(cancer_types)){
    for(dCounter in seq_along(direction)){
      t <- treatments[tCounter]
      c <- cancer_types[cCounter]
      direc <- direction[dCounter]
      treatmentName <- paste0(c,"_",t,"_",direc)
      print(treatmentName)
      tfInNetwork <- coreTf
      tfInNetwork <- substr(tfInNetwork, 0, nchar(as.character(tfInNetwork)) -3)
      geneNetwork <- data.frame(allRegulons[treatmentName])
      colnames(geneNetwork) <- c("Source", "Target", "Dataset")
      geneNetwork$Source <- substr(geneNetwork$Source, 0, nchar(as.character(geneNetwork$Source)) -3)
      
      # Include any interactions where TFs from core network are source or targets
      coreInteractions <- geneNetwork[which(geneNetwork$Source %in% tfInNetwork),]
      coreInteractionsTmp <- geneNetwork[which(geneNetwork$Target %in% tfInNetwork),]
      
      coreInteractions <- rbind(coreInteractions, coreInteractionsTmp)
      colnames(coreInteractions) <- c("Source", "Target", "Dataset")
      
      coreInteractions <- coreInteractions[!duplicated(coreInteractions),]
      
      seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",direc,".Rds"))
      networkGenes <- union(coreInteractions$Source, coreInteractions$Target)
      tmp <- seurat_regs@assays$RNA@scale.data
      regNames <- substr(rownames(tmp), 0, nchar(as.character(rownames(tmp))) -3)
      networkGenes <- networkGenes[which(networkGenes %in% regNames)]
      coreInteractions <- coreInteractions[which(coreInteractions$Source %in% networkGenes),]
      coreInteractions <- coreInteractions[which(coreInteractions$Target %in% networkGenes),]
      networkGenes <- union(coreInteractions$Source, coreInteractions$Target)
      rownames(tmp) <- regNames
      tmp <- tmp[which(rownames(tmp) %in% networkGenes),]
      tmp <- t(tmp)
      actMi <- calculateMI(actMat = tmp)
      
      mi <- integer(length = length(coreInteractions$Source))
      for(int in seq_along(coreInteractions$Source)){
        mi[int] <- actMi[as.character(coreInteractions$Source[int]), as.character(coreInteractions$Target[int])]
      }
      coreInteractions$Mi <- mi
      
      actCor <- cor(tmp, method = "s")
      type <- integer(length = length(coreInteractions$Source))
      for(int in seq_along(coreInteractions$Source)){
        type[int] <- actCor[as.character(coreInteractions$Source[int]), as.character(coreInteractions$Target[int])]
      }
      coreInteractions$Cor <- type
      type[type>0] <- 1
      type[type<=0] <- 2
      coreInteractions$Type <- type
      coreInteractions$Dataset <- treatmentName
      #  }
      fullNetworkList[[treatmentName]] <- coreInteractions
      counter <- counter+1
    }
  }
}
saveRDS(fullNetworkList, file = "fullNetworkList_top23RegAllRegAllInt.rds")

srcGenes <- lapply(fullNetworkList, function(x)x$Source)
tgtGenes <- lapply(fullNetworkList, function(x)x$Target)
dataset <- lapply(fullNetworkList, function(x)x$Dataset)
types <- lapply(fullNetworkList, function(x)x$Type)
actCor <- lapply(fullNetworkList, function(x)x$Cor)
mi <- lapply(fullNetworkList, function(x)x$Mi)
cellLine <- str_match(unlist(dataset), "(.*?)_")[,2]
signal <- str_match(unlist(dataset), "_(.*?)_")[,2]

allNetworkFull <- data.frame(Source = unlist(srcGenes), Target = unlist(tgtGenes), Type = unlist(types), Dataset = unlist(dataset), Cor = unlist(actCor), Mi = unlist(mi), CellLine = cellLine,Signal = signal, stringsAsFactors = FALSE)
saveRDS(allNetworkFull, "allNetworkFullCoreMiTop23AllRegAllInt.rds")

Create and simulate threshold specific networks

We defined a function to filter out interactions and simulated the network.

library(sRACIPE)
library(stringr)

createNetworkMiValue <- function(tmpNetwork=tmpNetwork,posMi = posMi,
                                 negMi = negMi, name = "A549", simulate = FALSE){
  tmpNetwork <- tmpNetwork[order(tmpNetwork$Mi, decreasing = TRUE),]
  tmpNetwork <- tmpNetwork[!duplicated(tmpNetwork[c("Source","Target","Type")]),]
  posNetwork <- tmpNetwork[tmpNetwork$Cor >0,]
  posNetwork <- posNetwork[which(posNetwork$Mi > posMi),]
  negNetwork <- tmpNetwork[tmpNetwork$Cor < 0,]
  negNetwork <- negNetwork[which(negNetwork$Mi > negMi),]
  if(dim(posNetwork)[1] == 0 | dim(negNetwork)[1] == 0) return()
  
  tmpNetwork <- rbind(posNetwork,negNetwork)
  sum(duplicated(tmpNetwork[,c("Source","Target")]))
  sum(duplicated(tmpNetwork[,c("Source","Target","Type")]))
  tmpNetwork <- tmpNetwork[!duplicated(tmpNetwork[,c("Source","Target","Type")]),]
  tmpNetwork[duplicated(tmpNetwork[,c("Source","Target")]),]
  tmpNetwork[duplicated(tmpNetwork[,c("Source","Target")], fromLast = TRUE),]
  delRows <- anyDuplicated(tmpNetwork[,c("Source","Target")])
  delRows <- c(delRows, anyDuplicated(tmpNetwork[,c("Source","Target")], fromLast = TRUE))
  delRows <- delRows[delRows>0]
  if(length(delRows>0)){ tmpNetwork <- tmpNetwork[-delRows,]}
  
  interactionRemoved = length(tmpNetwork$Source)
  # If signal nodes are to be removed iteratively, uncomment the next line
  # while(interactionRemoved>0)
  {
    tmpVar = length(tmpNetwork$Source)
    
    tmpNetwork <- tmpNetwork[(which(tmpNetwork$Source %in% tmpNetwork$Target )),]
    #     tmpNetwork <- tmpNetwork[which(tmpNetwork$Target %in% tmpNetwork$Source),]
    interactionRemoved = tmpVar - length(tmpNetwork$Source)
  }
  
  tmpNetwork <- tmpNetwork[!duplicated(tmpNetwork),]
  message(length(tmpNetwork$Source))
  networkTfs <- union(tmpNetwork$Source,tmpNetwork$Target)
  coreTfGenes <- substr(coreTf,0,nchar(coreTf)-3)
  print(networkTfs[which(networkTfs %in% coreTfGenes)])
  
  if(length(union(tmpNetwork$Source,tmpNetwork$Target)) <5) simulate = FALSE
  #  if(length(which(tmpNetwork$Type ==2)) <2) simulate = FALSE
  #  if(length(which(tmpNetwork$Type ==1)) <2) simulate = FALSE
  
  if(simulate){
    tmp <- sracipeSimulate(circuit = tmpNetwork[,c("Source", "Target", "Type")], plotToFile = TRUE,
                           numModels = 2000, plots = FALSE, stepper = "RK4", integrateStepSize = 0.02)
    annotation(tmp) <- paste0(name,"_",as.character(posMi), "_",as.character(negMi),"Interactions")
    # tmp <- sracipePlotData(tmp, plotToFile = T)
    saveRDS(tmp, file = paste0("simAll/",name,"_",as.character(posMi), "_",as.character(negMi),".rds"))
  }
  # return(tmpNetwork)
}

The simulated data can be plotted and visualized.

tmp <- readRDS(paste0("simAll/","A549_TNF_0.2_0.3.rds"))
tmp <- sracipePlotData(tmp)

Next we varied the positive and negative mutual information cutoffs to simulate a large number of networks.

coreTf <- readRDS(("coreTf.rds"))
allNetworkFull <- readRDS("allNetworkFullCoreMiTop23AllRegAllInt.rds")

for(posMi in seq(1,0.05,-0.05)){
  for(negMi in seq(0.5,0.05,-0.05)){
    cellLine = "DU145"
    treatment <- "TNF"
    print(paste0(cellLine,"_",treatment,"_",as.character(posMi),"_",as.character(negMi)))
    tmpNetwork <- allNetworkFull[(allNetworkFull$CellLine == cellLine & allNetworkFull$Signal == treatment),]
    
    createNetworkMiValue(tmpNetwork,posMi=posMi,negMi = negMi,
                         name=paste0(cellLine,"_",treatment), simulate = TRUE)
  }
}

Epithelial/Mesenchymal classification of DATFs

library(igraph)

options(stringsAsFactors = FALSE)

allNetworkFull <- readRDS("allNetworkFullCoreMiTop23AllRegAllInt.rds")
allGenes <- union(allNetworkFull$Source,allNetworkFull$Target)
degs <- readRDS(file = "output/network_construction/degRegsFwd0dRev7d.rds")
degs$Signal <- str_match(degs$Comparison, "_(.*?)_")[,2]


counter <- 1
treatmentNames <- c()
for(tCounter in seq_along(treatments)){
  for(cCounter in seq_along(cancer_types)){
    for(dCounter in seq_along(direction)){
      t <- treatments[tCounter]
      c <- cancer_types[cCounter]
      direc <- direction[dCounter]
      treatmentNames <- c(treatmentNames,paste0(c,"_",t,"_",direc))
      
    }
  }
}

EMClass <- matrix(data = NA,ncol = length(treatmentNames),nrow = length(allGenes),dimnames = list(allGenes, treatmentNames))

for(tCounter in seq_along(treatmentNames)){
  #  tCounter = 1
  degTmp <- degs[which((degs$Comparison == treatmentNames[tCounter]) & (degs$Time == "7d") & (degs$p_val_adj < 0.05)),]
  Mgenes <- degTmp$gene[which(degTmp$avg_logFC>0)]
  Mgenes <- substr(Mgenes,0,nchar(Mgenes)-3)
  Mgenes <- Mgenes[which(Mgenes %in% allGenes)]
  Egenes <- degTmp$gene[which(degTmp$avg_logFC<0)]
  Egenes <- substr(Egenes,0,nchar(Egenes)-3)
  Egenes <- Egenes[which(Egenes %in% allGenes)]
  EMClass[Egenes,treatmentNames[tCounter]] <- FALSE
  EMClass[Mgenes,treatmentNames[tCounter]] <- TRUE
  
  degTmp <- degs[which((degs$Comparison == treatmentNames[tCounter]) & (degs$Time == "3d_rm") & (degs$p_val_adj < 0.05)),]
  Mgenes <- degTmp$gene[which(degTmp$avg_logFC<0)]
  Mgenes <- substr(Mgenes,0,nchar(Mgenes)-3)
  Mgenes <- Mgenes[which(Mgenes %in% allGenes)]
  Egenes <- degTmp$gene[which(degTmp$avg_logFC>0)]
  Egenes <- substr(Egenes,0,nchar(Egenes)-3)
  Egenes <- Egenes[which(Egenes %in% allGenes)]
  EMClass[Egenes,treatmentNames[tCounter]] <- FALSE
  EMClass[Mgenes,treatmentNames[tCounter]] <- TRUE
}
saveRDS(EMClass, file = "EMClassP05.rds")

coreGenes <- substr(coreTf, 0, nchar(coreTf)-3)

counter <- 1
conditionNames <- c()
for(tCounter in seq_along(treatments)){
  for(cCounter in seq_along(cancer_types)){
    t <- treatments[tCounter]
    c <- cancer_types[cCounter]
    conditionNames <- c(conditionNames,paste0(c,"_",t))
  }
}


EMClassCondition <- matrix(data = NA,ncol = length(conditionNames),nrow = length(allGenes),dimnames = list(allGenes, conditionNames))
for(tCounter in seq_along(conditionNames)){
  for(geneCounter in seq_along(allGenes)){
    if(is.na(EMClass[allGenes[geneCounter],2*tCounter-1])){
      if(!is.na(EMClass[allGenes[geneCounter],2*tCounter])){
        EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- EMClass[allGenes[geneCounter],2*tCounter]
      }
    }
    if(!is.na(EMClass[allGenes[geneCounter],2*tCounter-1])){
      if(is.na(EMClass[allGenes[geneCounter],2*tCounter])){
        EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- EMClass[allGenes[geneCounter],2*tCounter-1]
      }
      if(!is.na(EMClass[allGenes[geneCounter],2*tCounter])){
        if(EMClass[allGenes[geneCounter],2*tCounter-1] == EMClass[allGenes[geneCounter],2*tCounter]){
          EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- EMClass[allGenes[geneCounter],2*tCounter-1]}
        if(!(EMClass[allGenes[geneCounter],2*tCounter-1] == EMClass[allGenes[geneCounter],2*tCounter])){
          EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- NA}
      }
    }
  }
}
saveRDS(EMClassCondition, "EMClassCondition.rds")

Network Metrics

EMClassCondition <- readRDS("EMClassCondition.rds")

fileNames <- list.files("simAll/")
networkMetrics <- data.frame(Nodes=integer(),Interactions=integer(),PosInt = integer(),EModels = integer(), MModels = integer(),Connected = logical(),MGenes = integer(), Egenes = integer(), ModelCount = integer(), CellLine = character(), Signal = character(), Dataset = character(), transitivity = numeric(), MeanDistance = numeric(), Accuracy = numeric(), Filename = character(), MiPos = numeric(), MiNeg = numeric())

hammingTable <- c(0,7,12,17,22,28,35,42,49,56,64,71,79)
for(fileCounter in seq_along(fileNames)){
  
  fileNameTmp <- fileNames[fileCounter]
  print(fileNameTmp)
  
  cellLine <- str_match(fileNameTmp, "(.*?)_")[,2]
  signal <- str_match(fileNameTmp, "_(.*?)_")[,2]
  posMi <- as.numeric(gsub("^(?:[^_]+_){2}([^_]+).*", "\\1", fileNameTmp))
  negMi <- gsub("^(?:[^_]+_){3}([^_]+).*", "\\1", fileNameTmp)
  negMi <- as.numeric(substr(negMi,0,nchar(negMi)-4))
  
  condition <- paste0(cellLine, "_",signal)
  
  racipe <- readRDS(paste0("simAll/",fileNameTmp))
  # sracipePlotCircuit(racipe,plotToFile = FALSE)
  racipe <- sracipeNormalize(racipe)
  expData <- assay(racipe,1)
  
  EMGenes <- EMClassCondition[,condition]
  MState <- EMGenes[!is.na(EMGenes)]
  circuit <-  as.data.frame(sracipeCircuit(racipe))
  g <- graph_from_data_frame(circuit, directed = TRUE, vertices = NULL)

  commonGenes <- rownames(expData)
  commonGenes <- commonGenes[which(commonGenes %in% names(MState))]
  expData <- expData[commonGenes,]
  expData <- Binarize::binarizeMatrix(expData, method = "kMeans")[,1:dim(expData)[2]]
  
  MState <- MState[commonGenes]
  EState <- !MState
  expDataTmp <- as.matrix(expData)
  storage.mode(expDataTmp) <- "logical"
  if(length(MState)>85) message("Extend hamming table cutoffs")
  hammingCutoff <- findInterval(length(MState),hammingTable)
  hamDistE <- apply((expDataTmp), 2, function(x) sum(x != EState))
  hamDistM <- apply((expDataTmp), 2, function(x) sum(x != MState))
  networkMetricsTmp <- data.frame(Nodes= length(union(circuit$Source,circuit$Target)),Interactions=length(circuit$Source),PosInt = length(which(circuit$Type ==1)),EModels = length(which(hamDistE < hammingCutoff)), MModels = length(which(hamDistM < hammingCutoff)),Connected = igraph::is_connected(g),MGenes = sum(MState), Egenes = sum(EState), ModelCount = sracipeConfig(racipe)$simParams["numModels"], CellLine = cellLine, Signal = signal, Dataset = condition, Transitivity = igraph::transitivity(g),MeanDistance = igraph::mean_distance(g, directed = TRUE, unconnected = FALSE), Filename=fileNameTmp, MiPos = posMi, MiNeg = negMi)
  networkMetricsTmp$Accuracy <- sum(networkMetricsTmp$EModels + networkMetricsTmp$MModels)/networkMetricsTmp$ModelCount
  rownames(networkMetricsTmp) <- condition
  networkMetrics <- rbind(networkMetrics, networkMetricsTmp)
}
saveRDS(networkMetrics, file = "networkMetrics.rds")

The network metrics can be visualized using ggplot.

networkMetrics <- networkMetrics[networkMetrics$Connected,]
networkMetrics <- networkMetrics[which((networkMetrics$MGenes + networkMetrics$Egenes)>4),]

ggplot(networkMetrics) +
  geom_point(aes(x=Nodes,y=Accuracy), alpha=0.5, size = 0.5) +
  theme_bw()+
  xlab("Number of TFs") +
  #  facet_wrap(~Dataset, ncol = 3) +
  theme(text=element_text(size=20),
        axis.text.x=element_text(angle=90, hjust=1))

ggplot(networkMetrics) +
  geom_point(aes(x=Interactions,y=Accuracy),  alpha=0.5, size = 0.5) +
  theme_bw()+
  xlab("Number of Interactions") +
  # facet_wrap(~Dataset, ncol = 3) +
  theme(text=element_text(size=20),
        axis.text.x=element_text(angle=90, hjust=1))

plotData <- networkMetrics
plotData$EMNodes <- (networkMetrics$MGenes + networkMetrics$Egenes)/networkMetrics$Nodes


ggplot(plotData) +
  geom_box(aes(x=EMNodes ,y=Accuracy), alpha=0.5, size = 0.5) +
  theme_bw()+
  xlab("Fraction of EM TFs") +
  #  facet_wrap(~Dataset, ncol = 3) +
  theme(text=element_text(size=20),
        axis.text.x=element_text(angle=90, hjust=1))


plotData <- networkMetrics 
plotData <- plotData[plotData$Connected == TRUE,]
plotData$DiffFraction <- (plotData$Interactions - plotData$PosInt)/plotData$Interactions

ggplot(plotData, aes(x=Interactions,y=Accuracy)) +
  geom_point() +
  xlab("Number of Interactions") +
  theme_bw() +
  theme(text=element_text(size=20)
  )

ggplot(plotData, aes(x=Interactions,y=Accuracy)) +
  geom_point() +
  xlab("Number of Interactions") +
  theme_bw() +
  theme(text=element_text(size=20)
  )
plotData$EMFraction <- (plotData$Egenes+ plotData$MGenes)/plotData$Nodes
ggplot(plotData, aes(x=plotData$EMFraction,y=Accuracy)) +
  geom_point() +
  xlab("Fraction of EM TFs") +
  theme_bw() +
  theme(text=element_text(size=20)
  )


ggplot(plotData, aes(x=plotData$EMFraction,y=Accuracy)) +
  geom_point() +
  xlab("Fraction of EM TFs") +
  theme_bw() +
  facet_wrap(~Dataset) +
  theme(text=element_text(size=20)
  )

ggplot(plotData, aes(x=EMFraction,y=Accuracy)) +
  geom_point() +
  xlab("Fraction of EM TFs") +
  theme_bw() +
  facet_wrap(~Dataset) +
  theme(text=element_text(size=20)
  )



plotData$posIntFrac <- plotData$PosInt/plotData$Interactions
ggplot(plotData, aes(x=posIntFrac,y=Accuracy)) +
  geom_point() +
  xlab("Fraction of Excitatory Interactions") +
  theme_bw() +
  facet_wrap(~Dataset) +
  theme(text=element_text(size=20)
  )

plotData$NegIntFrac <- 1- plotData$posIntFrac
ggplot(plotData, aes(x=NegIntFrac,y=Accuracy)) +
  geom_point() +
  xlab("Fraction of Inhibitory Interactions") +
  theme_bw() +
  facet_wrap(~Dataset) +
  theme(text=element_text(size=20)
  )
ggplot(plotData, aes(x=plotData$EMFraction,y=Accuracy)) +
  geom_point() +
  xlab("Fraction of EM TFs") +
  theme_bw() +
  facet_wrap(~Dataset) +
  theme(text=element_text(size=20)
  )

Comparing experimental data with simulated data

We can plot the experimental and simulated datasets together to visualize whether the simulated data capture the TF activity derived from the experimental data. Here we show the results for the data from OVCA420 cell line with TGFB1 treatment and a network constructed using 0.4 and 0.1 mutual information cutoffs for excitatory and inhibitory interactions.

# Read the simulated data from the simulations above.
expRacipe <- readRDS(paste0("simAll/","OVCA420_TGFB1_0.4_0.1.rds"))
circuit <- sracipeCircuit(expRacipe)
networkTFs <- union(circuit$Source,circuit$Target)
expRacipeNorm <- sracipeNormalize(expRacipe)
expData <- assay(expRacipeNorm,1)

expDataOrig <- assay(expRacipe,1)
expDataOrig <- log2(1+expDataOrig)
expDataOrig <- expDataOrig[commonTfs,]

tmpMeans <- rowMeans(expDataOrig)
tmpSds <- apply(expDataOrig,1,sd)

seuratFwd <- readRDS(file = "output/auc/seurat_regsOVCA420_TGFB1_fwd.Rds")
actFwd <- seuratFwd@assays$RNA@scale.data
tfNames <- rownames(actFwd)
tfNames <- substr(tfNames, 0, nchar(tfNames)-3)
rownames(actFwd) <- substr(rownames(actFwd), 0, nchar(rownames(actFwd))-3)
commonTfs <- intersect(networkTFs,rownames(actFwd))

seuratRev <- readRDS(file = "output/auc/seurat_regsOVCA420_TGFB1_rev.Rds")
actRev <- seuratRev@assays$RNA@scale.data
tfNames <- rownames(actRev)
tfNames <- substr(tfNames, 0, nchar(tfNames)-3)
rownames(actRev) <- substr(rownames(actRev), 0, nchar(rownames(actRev))-3)
commonTfs <- intersect(commonTfs,rownames(actRev))

expData <- expData[commonTfs,]
actFwd <- actFwd[commonTfs,]
actRev <- actRev[commonTfs,]

plotColor <- c("#5E4FA2", "#4F61AA", "#4173B3", "#3386BC", "#4198B6",
               "#51ABAE", "#62BEA6", "#77C8A4", "#8ED1A4", "#A4DAA4",
               "#B8E2A1", "#CBEA9D", "#DEF199", "#EAF69F", "#F2FAAC",
               "#FAFDB8", "#FEFAB6", "#FEF0A5", "#FEE695", "#FDD985",
               "#FDC978", "#FDB96A", "#FCA75E", "#F99254", "#F67D4A",
               "#F26943", "#E85A47", "#DE4B4B", "#D33C4E", "#C1284A",
               "#AF1446", "#9E0142")

pca = prcomp(t(expData), scale. = T, center = T)
pcaData <- pca$x
pcaDataFwd <- (scale(t(actFwd), pca$center, pca$scale) %*%
                 pca$rotation)
pcaDataFwd <- data.frame(PC1=pcaDataFwd[,1],PC2=pcaDataFwd[,2], Time = seuratFwd$Time)
pcaDataRev <- (scale(t(actRev), pca$center, pca$scale) %*%
                 pca$rotation)
pcaDataRev <- data.frame(PC1=pcaDataRev[,1],PC2=pcaDataRev[,2], Time = seuratRev$Time)
expDataSim <- data.frame(pca$x[,1:2])
expDataSim$Time <- "Sim"
pcaDataFwd$Dir <- "Fwd"
pcaDataRev$Dir <- "Rev"
expDataSim$Dir <- "None"

# Fit a linear model to activities from cells from day 7 from forward and
# reverse directions.

sevenD <- rownames(pcaDataFwd[which(pcaDataFwd$Time == "7d"),])
sevenDFwd <- pcaDataFwd[sevenD,]$PC1
sevenDRev  <- pcaDataRev[sevenD,]$PC1
pred <- pcaDataRev$PC1
pcaDataRevScaled <- data.frame(PC1 = predict(lm(sevenDFwd ~ sevenDRev),data.frame(sevenDRev = pred)))

sevenDFwd <- pcaDataFwd[sevenD,]$PC2
sevenDRev  <- pcaDataRev[sevenD,]$PC2
pred <- pcaDataRev$PC2
pcaDataRevScaled$PC2 <- predict(lm(sevenDFwd ~ sevenDRev),data.frame(sevenDRev = pred))

pcaDataRevScaled$Time <- pcaDataRev$Time
pcaDataRevScaled$Dir <- pcaDataRev$Dir
allData <- rbind(expDataSim, pcaDataFwd,pcaDataRevScaled)
allData <- rbind(pcaDataFwd,pcaDataRevScaled)
allData <- (pcaDataRevScaled)
# allData <- (pcaDataRev)
# allData <- pcaDataFwd
allData <- expDataSim
p <- ggplot2::ggplot(allData) +
  #    geom_point(aes(x = allData[,1], y =allData[,2], color = Time, shape = Dir), alpha = 0.5) +
  xlab(paste0("PC1(",100*summary(pca)$importance[2,1],"%)")) +
  ylab(paste0("PC2(",100*summary(pca)$importance[2,2],"%)")) +
  #    stat_summary(aes(x = allData[,1], y =allData[,2], group=Dir, color = Dir), fun.y=mean, geom="line") +
  geom_point(aes(x = allData[,1], y =allData[,2], group=Dir, color = Time, shape = Dir), alpha = 0.5) +
  theme_bw() +
  theme(text = element_text(size=15)) +
xlim(-5,11) +
  ylim(-5,4)
  NULL

Parametric perturbations of models in simulations

We can modify the parameters for selected models and simulate the circuit with modified parameters. Here, we increased the production rate of JUN and RELB and used the steady state values from unperturbed simulations as the initial conditions for new stochastic simulations. For more details, please see here.

library(sRACIPE)
library(ggplot2)
newRacipe <- readRDS("eModelsRacipe.rds")
tmp <- sracipeParams(newRacipe)
tmp[,"G_JUN"] <- 10000*tmp[,"G_JUN"]
tmp[,"G_RELB"] <- 10000*tmp[,"G_RELB"]
sracipeParams(newRacipe) <- tmp
tmp <- assay(newRacipe,1)
sracipeIC(newRacipe) <- (tmp)
set.seed(123)
newRacipe2_D05_T2000 <-  sracipeSimulate(
  newRacipe, numModels = 1543, initialNoise = 0.05,simDet = FALSE,
  nNoise=0,stepper = "EM", plots = FALSE,integrateStepSize = 0.01,
  genParams = FALSE,genIC = FALSE, simulationTime = 2000)

saveRDS(newRacipe2_D05_T2000, file = "eModelsRacipe_D05_T2000.rds")

# We can instead also record the state of the models at multiple time points,
# for example, here we record the state at every 0.02 time. 

newRacipeTime <-  sracipeSimulate(
  newRacipe, numModels = 1543, initialNoise = 0.05,simDet = FALSE,
  nNoise=0,stepper = "EM", plots = FALSE,integrateStepSize = 0.01,
  genParams = FALSE,genIC = FALSE, simulationTime = 50, printInterval = 0.02, printStart = 0)

saveRDS(newRacipeTime, file = "eModelsRacipe_D05_T50_interval02.rds")