We downloaded the rds files from the google drive link provided in the Cook et al. manuscript. The rds files contain the preprocessed data which we extracted and used for subsequent analysis. We separated the data from each treatment into two groups - forward transition (8 hour, Day 0,1,3,7) or reverse transition (Day 7, after removal - 8 hour, Day 1, 3). The data from day 7 were included in both datasets. We wrote the data to .tsv files and used those as the input file in SCENIC.
# Prepare data for processing
library("Seurat")
library(Seurat)
library(stringr)
library(Seurat)
library(jsonlite)
library(sjmisc)
library(MASS)
library(sRACIPE)
library(UpSetR)
library(ggplot2)
dir <- "/EmtRds" # Directory where the downloaded rds files are stored.
files <- list.files(dir)
for(i in seq_along(files)){
filename <- paste0(dir,files[i])
data <- readRDS(filename)
fwd <- subset(data, Time %in% c("0d", "1d", "3d", "7d", "8h"))
tmp <- as.matrix(Seurat::GetAssayData(fwd[["RNA"]], slot = "data"))
write.table(tmp,row.names = TRUE,col.names = TRUE, sep = "\t", file = paste0(dir,"expression/",tools::file_path_sans_ext(files[i]),"_fwd.tsv"),quote = FALSE)
rm(tmp,fwd)
rev <- subset(data, Time %in% c("8h_rm", "1d_rm","3d_rm", "7d"))
tmp <- as.matrix(Seurat::GetAssayData(rev[["RNA"]], slot = "data"))
write.table(tmp,row.names = TRUE,col.names = TRUE, sep = "\t", file = paste0(dir,"expression/",tools::file_path_sans_ext(files[i]),"_rev.tsv"),quote = FALSE)
}
We downloaded the “hg19 cis target databases” and “motif to tf annotation” from cisTarget databaes and used those in pySCENIC v0.9.19 alongwith the .tsv files generated above. We followed the pySCENIC full pipeline code for running pySCENIC. Our specific python script is given below.
if __name__ == '__main__':
import os
import glob
import pickle
import pandas as pd
import numpy as np
import json
import seaborn as sns
import sys
from dask.diagnostics import ProgressBar
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell
inputFilename = sys.argv[1] # Name of the specific data file provided as input through the shell script file
RESULT_FOLDER = "../results"
DATA_FOLDER = "../expression"
RESOURCES_FOLDER = "../resources"
DATABASE_FOLDER = "../databases/"
SCHEDULER = "123.122.8.24:8786"
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg19*.mc9nr.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(DATABASE_FOLDER, "motifs-v9-nr.hgnc-m0.001-o0.0.tbl")
MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'hs_hgnc_curated_tfs.txt')
SC_EXP_FNAME = os.path.join(DATA_FOLDER, (inputFilename + ".tsv"))
MODULES_FNAME = os.path.join(RESULT_FOLDER, ("modules_" + inputFilename + ".p"))
REGULONS_FNAME = os.path.join(RESULT_FOLDER, "regulons_" + inputFilename + ".p")
MOTIFS_FNAME = os.path.join(RESULT_FOLDER, "motifs_" + inputFilename + ".csv")
ex_matrix = pd.read_csv(SC_EXP_FNAME, sep='\t', header=0, index_col=0).T
print(ex_matrix.shape)
tf_names = load_tf_names(MM_TFS_FNAME)
print("tf names loaded")
out_file = os.path.join(RESULT_FOLDER, "grn_output_" + inputFilename + ".tsv")
db_fnames = glob.glob(DATABASES_GLOB)
def name(fname):
return os.path.splitext(os.path.basename(fname))[0]
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
print(dbs)
print("running grnboost")
adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
adjacencies.head()
print("identify modules")
adjacencies.to_csv(out_file, sep='\t', index=False, header=False)
print("grnboost done")
# adjacencies = pd.read_csv(out_file, sep='\t', names=["TF", "target", "importance"])
modules = list(modules_from_adjacencies(adjacencies, ex_matrix, rho_mask_dropouts=True))
print("writing modules")
with open(MODULES_FNAME, 'wb') as f:
pickle.dump(modules, f)
# print("reading modules")
# with open(MODULES_FNAME, 'rb') as f:
# modules = pickle.load(f)
print("Finding Enriched modules")
# Calculate a list of enriched motifs and the corresponding target genes for all modules.
with ProgressBar():
df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
df.head()
# Create regulons from this table of enriched motifs.
print("creating regulons")
regulons = df2regulons(df)
print("writing regulons")
# Save the enriched motifs and the discovered regulons to disk.
df.to_csv(MOTIFS_FNAME)
with open(REGULONS_FNAME, "wb") as f:
pickle.dump(regulons, f)
print("Finding AUC of cells")
auc_mtx = aucell(ex_matrix, regulons, num_workers=1)
auc_file = os.path.join(RESULT_FOLDER, "AUC_" + inputFilename + ".csv")
auc_mtx.to_csv(auc_file)
# Write to JSON for import to R
rdict = {}
for reg in regulons:
# print(reg)
targets = [ target for target in reg.gene2weight ]
rdict[reg.name] = targets
rjson = json.dumps(rdict)
jsonFile = os.path.join(RESULT_FOLDER, "regulons_" + inputFilename + ".json")
f = open(jsonFile,"w")
f.write(rjson)
f.close()
# Plot the results for visualization
sns_plot = sns.clustermap(auc_mtx, figsize=(8,8))
plotFile = os.path.join(RESULT_FOLDER, "AUC_" + inputFilename + ".png")
sns_plot.savefig(plotFile)
The pySCENIC pipeline generated regulons and the TF activities which were used for subsequent analysis in R. First, we processed the calculated activities using Seurat and created Seurat objects containing the activities.
options(stringsAsFactors = FALSE)
## Network Construction
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
direction <- c("fwd", "rev")
for(c in 1:length(cancer_types)) {
## Read in seurat object and regulon data
# c <- 2
data_dir <- "EmtRds/"
seurat <- readRDS(paste0(data_dir,cancer_types[c],"_",t,".rds"))
auc_fname <- paste0(data_dir,"/SCENIC/results/AUC_",cancer_types[c],"_",t,"_fwd.csv")
auc_matrix <- read.table(auc_fname, header=TRUE, sep = ",", row.names = 1, check.names = FALSE)
auc_matrix <- t(auc_matrix)
seuratFwd <- subset(seurat, Time %in% c("0d", "1d", "3d", "7d", "8h"))
auc_matrix <- auc_matrix[,colnames(seuratFwd)]
seurat_regs <- CreateSeuratObject(counts = auc_matrix)
seurat_regs <- AddMetaData(seurat_regs, metadata = FetchData(seurat, "Time"))
Idents(seurat_regs) <- "Time"
allGenes <- rownames(seurat_regs)
seurat_regs <- ScaleData(seurat_regs, features = allGenes)
seurat_regs <- FindVariableFeatures(seurat_regs)
seurat_regs <- RunPCA(seurat_regs)
seurat_regs <- RunUMAP(seurat_regs, dims = 1:50)
## Save Rds files
saveRDS(seurat_regs, file = paste0(output_dir,"seurat_regs",cancer_types[c],"_",t,"_fwd.Rds"))
auc_fname <- paste0(data_dir,"/SCENIC/results/AUC_",cancer_types[c],"_",t,"_rev.CSV")
auc_matrix <- read.table(auc_fname, header=TRUE, sep = ",", row.names = 1, check.names = FALSE)
auc_matrix <- t(auc_matrix)
seuratRev <- subset(seurat, Time %in% c("1d_rm","3d_rm", "7d", "8h_rm" ))
auc_matrix <- auc_matrix[,colnames(seuratRev)]
seurat_regs <- CreateSeuratObject(counts = auc_matrix)
seurat_regs <- AddMetaData(seurat_regs, metadata = FetchData(seurat, "Time"))
Idents(seurat_regs) <- "Time"
allGenes <- rownames(seurat_regs)
seurat_regs <- ScaleData(seurat_regs, features = allGenes)
seurat_regs <- FindVariableFeatures(seurat_regs)
seurat_regs <- RunPCA(seurat_regs)
seurat_regs <- RunUMAP(seurat_regs, dims = 1:50)
saveRDS(seurat_regs, file = paste0(output_dir,"seurat_regs",cancer_types[c],"_",t,"_rev.Rds"))
}
Next, we identified the differentially activated transcription factors (DATFs) between 0d and other time points in the forward transition and 7 days and other time points in the reverse direction.
options(stringsAsFactors = FALSE)
## Network Construction
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
direction <- c("fwd", "rev")
counter <- 1
degs <- data.frame()
for(tCounter in seq_along(treatments)){
for(cCounter in seq_along(cancer_types)){
for(dCounter in seq_along(direction)){
## Select a treatment to work with (or iterate thru t in a loop)
t <- treatments[tCounter]
c <- cancer_types[cCounter]
direc <- direction[dCounter]
treatmentName <- paste0(c,"_",t,"_",direc)
print(treatmentName)
output_dir <- paste0("output/auc/")
seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",direc,".Rds"))
if(direc == "fwd"){
for(degTime in c("8h", "1d", "3d", "7d")){
deg <- FindMarkers(seurat_regs, ident.1 = degTime, ident.2 = "0d", logfc.threshold = 0.0)
deg$CellLine <- c
deg$gene <- rownames(deg)
deg$Comparison <- paste0(c,"_",t,"_",direc)
deg$Time <- degTime
degs <- rbind(degs,deg)
}
}
if(direc == "rev"){
for(degTime in c("8h_rm", "1d_rm", "3d_rm")){
deg <- FindMarkers(seurat_regs, ident.1 = degTime, ident.2 = "7d", logfc.threshold = 0.0)
deg$CellLine <- c
deg$gene <- rownames(deg)
deg$Comparison <- paste0(c,"_",t,"_",direc)
deg$Time <- degTime
degs <- rbind(degs,deg)
}
}
}
}
}
saveRDS(degs, file = "degRegsFwd0dRev7d.rds")
Next, we looked at the overlap between the DATFs across various treatments. We used the top 100 DATFs from each comparison.
library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(MASS)
library(sRACIPE)
library(UpSetR)
library(ggplot2)
options(stringsAsFactors = FALSE)
## NEtwork Construction
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
direction <- c("fwd", "rev")
counter <- 1
treatmentNames <- c()
for(tCounter in seq_along(treatments)){
for(cCounter in seq_along(cancer_types)){
t <- treatments[tCounter]
c <- cancer_types[cCounter]
treatmentNames <- c(treatmentNames,paste0(c,"_",t))
}
}
degs <- readRDS(file = "output/network_construction/degRegsFwd0dRev7d.rds")
degs$Signal <- str_match(degs$Comparison, "_(.*?)_")[,2]
diffRegulonList <- list()
nDiffRegulon <- 100
set.seed(123)
for(tCounter in seq_along(treatments)){
for(cCounter in seq_along(cancer_types)){
for(dCounter in seq_along(direction)){
t <- treatments[tCounter]
c <- cancer_types[cCounter]
direc <- direction[dCounter]
treatmentName <- paste0(c,"_",t,"_",direc)
print(treatmentName)
output_dir <- paste0("output/auc/")
deg_reg <- degs[degs$Comparison == paste0(c,"_",t,"_",direc),]
seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",direc,".Rds"))
regulon_fname <- paste0("EmtRds/SCENIC/results/regulons_",c,"_",t,"_",direc,".json")
regs <- read_json(regulon_fname, simplify = T)
networkReg <- deg_reg$gene[rank(deg_reg$p_val_adj, ties.method = "random")]
networkReg <- unique(networkReg)
networkReg <- networkReg[1:min(nDiffRegulon,length(networkReg))]
regs <- regs[networkReg]
diffRegulonList[[treatmentName]] <- regs
}
}
}
saveRDS(diffRegulonList, file = "diffRegulonTop100Deg0d7d.rds")
diffRegulonNames <- lapply(diffRegulonList, function(x) names(x))
diffRegulonFwdRev <- list()
for(i in 1:(length(diffRegulonList)/2)){
diffRegulonFwdRev[[i]] <- union(diffRegulonNames[[2*i-1]],diffRegulonNames[[2*i]])
}
names(diffRegulonFwdRev) <- treatmentNames
saveRDS(diffRegulonFwdRev, file = "diffRegulonFwdRev.rds")
diffRegulonFwdRev = diffRegulonFwdRev[order(names(diffRegulonFwdRev))]
upset(fromList(diffRegulonFwdRev),keep.order = F, group.by = "degree", text.scale = 2, nsets = length(diffRegulonFwdRev))
The DATFs were used to identify core TFs which occur most frequently.
library(dplyr)
degs <- readRDS(file = "degRegsFwd0dRev7d.rds")
degsFiltered <- degs[degs$p_val_adj < 0.05,]
TfFrequency <- degsFiltered %>%
group_by(gene) %>%
summarize(n())
hist(TfFrequency$`n()`, breaks = 20)
coreTf <- TfFrequency$gene[TfFrequency$`n()` > 23]
saveRDS(coreTf, file = "coreTf.rds")
To understand the phenotypic distribution of states in terms of regulon activity, we performed UMAP dimensional reduction on all conditions using activity data from all identified regulons. In order to have comparable values for the fwd timepoints (0d-7d after signal induction) and the rev timepoints (7d-3d_rm, denoting 3 days after signal interruption), we scaled the regulon activity for rev timepoints using a linear model such that the 7d data aligned for both directions. We then visualized the UMAP values on a density map with the centroids for each timepoint indicated by a black point.
### Setup
rm(list = ls())
library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(varhandle)
library(sRACIPE)
library(plyr)
library(ggplot2)
library(ggpmisc)
library(rowr)
library(spatstat.utils)
library(umap)
library(ggthemes)
## Initialize some values
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
directions <- c("fwd","rev")
output_dir <- "output/cook/network_construction/dec1/"
data_dir <- "data/cook/"
scenic_data_dir <- "data/cook/SCENIC/results/"
network_dir <- "networks/vivekCode/cellLineNetworks/"
plotFName <- "figs/SI/revision/UMAP_density_alpha.pdf"
working_directory <- getwd()
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#E84855")
## Color setup for plots
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
plot_color <- rf(20)
## Make plot list
umap_density_list <- list()
iter <- 1
## Loop thru cell lines, treatments, directions
for(cancer in seq_along(cancer_types)) {
c <- cancer_types[cancer]
for(treatment in seq_along(treatments)) {
t <- treatments[treatment]
shared_regs <- c()
print(paste0("Starting work on ",c,"_",t))
for(dir in seq_along(directions)) {
d <- directions[dir]
if(d == "fwd") {
seurat_regs_fwd <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
auc_mtx_fwd <- as.data.frame(t(GetAssayData(seurat_regs_fwd, slot = "scale.data")))
auc_mtx_fwd$Time <- FetchData(seurat_regs_fwd, "Time")[,"Time"]
auc_mtx_fwd$Direction <- d
auc_mtx_fwd$Cell <- rownames(auc_mtx_fwd)
} else {
seurat_regs_rev <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
auc_mtx_rev <- as.data.frame(t(GetAssayData(seurat_regs_rev, slot = "scale.data")))
auc_mtx_rev$Time <- FetchData(seurat_regs_rev, "Time")[,"Time"]
auc_mtx_rev$Direction <- d
auc_mtx_rev$Cell <- rownames(auc_mtx_rev)
}
}
## Scale using 7d data
shared_regs <- colnames(auc_mtx_fwd)[which(colnames(auc_mtx_fwd) %in% colnames(auc_mtx_rev))]
shared_regs_short <- shared_regs[-((length(shared_regs)-2):length(shared_regs))]
auc_mtx_fwd_scale <- auc_mtx_fwd[,shared_regs]
auc_mtx_rev_scale <- auc_mtx_rev[,shared_regs]
for(reg in shared_regs_short) {
fwd_7d_reg <- auc_mtx_fwd_scale[which(auc_mtx_fwd_scale$Direction == "fwd" &
auc_mtx_fwd_scale$Time == "7d"),reg]
rev_7d_reg <- auc_mtx_rev_scale[which(auc_mtx_rev_scale$Direction == "rev"&
auc_mtx_rev_scale$Time == "7d"),reg]
## Generate linear models
x_lm <- lm(fwd_7d_reg ~ rev_7d_reg)
new_rev_reg <- (auc_mtx_rev_scale[which(auc_mtx_rev_scale$Direction == "rev" ),reg] *
unname(x_lm$coefficients["rev_7d_reg"])) +
unname(x_lm$coefficients["(Intercept)"])
## Replace reverse data
auc_mtx_rev_scale[,reg] <- new_rev_reg
}
## Combine fwd and rev data
auc_mtx_fwd_scale <- auc_mtx_fwd_scale[which(!auc_mtx_fwd_scale$Time == "7d"),]
combined_mtx <- rbind(auc_mtx_fwd_scale, auc_mtx_rev_scale)
na_regs <- colnames(combined_mtx[,colSums(is.na(combined_mtx)) != 0])
combined_mtx_noNA <- combined_mtx[ , colSums(is.na(combined_mtx)) == 0]
pca_regs <- shared_regs_short[which(shared_regs_short %in% colnames(combined_mtx_noNA))]
combined_cellInfo <- combined_mtx[,c("Time","Direction",'Cell')]
## Progress report
print(paste0("Finished scaling data for ",c,"_",t,". Starting dimensional reduction."))
## Run UMAP
umap_rs <- umap(combined_mtx_noNA[,pca_regs])
umap_data <- as.data.frame(umap_rs$layout)
colnames(umap_data) <- c("x","y")
if(all.equal(combined_cellInfo$Cell, rownames(umap_data))) {
umap_data$Time <- combined_cellInfo$Time
}
umap_data$Time <- factor(umap_data$Time, levels = c("0d", "8h", "1d", "3d", "7d", "8h_rm", "1d_rm", "3d_rm"))
## Progress report
print(paste0("Done with dimensional reductions for ",c,"_",t,". Creating plots now."))
## Calculate centroids for each timepoint
centroid_df_umap <- aggregate(umap_data[,c("x","y")], list(Time=umap_data$Time), mean)
## Plot UMAP density with timepoint data
ggscatter <- heatscatter(umap_data[,1], umap_data[,2],
xlab = "UMAP1",
ylab= "UMAP2",
ggplot = TRUE)
image <- ggscatter +
geom_point(data=centroid_df_umap, mapping = aes(x=x, y=y, size=Time, alpha=Time)) +
geom_path(data = centroid_df_umap, arrow = arrow(), linetype="dashed") +
guides(alpha=FALSE, size=FALSE, color=FALSE,
shape=FALSE) +
scale_shape_manual(values = c(15,16,17,18,8,21,24,23)) +
scale_alpha_discrete(breaks=c(0.4,0.485,0.57,0.655,0.74,0.825,0.91,1)) +
ggtitle(paste(c," ",t)) +
theme(title = element_text(size=20),
axis.title = element_text(size = 20),
axis.text = element_text(size=18),
plot.subtitle = element_text(size=16),
legend.title = element_text(size=20),
legend.title.align = 0,
legend.text = element_text(size=18),
legend.key.size = unit(20, units = "pt")) +
labs(subtitle = paste0("UMAP on ",length(pca_regs), " regulons in ", nrow(umap_data), " cells"))
if(t == "TNF") {
image <- image +
guides(size=guide_legend(title = "Time"),
color=guide_legend(override.aes = list(size = 3.5)),
shape=guide_legend(override.aes = list(size = 3.5)))
}
umap_density_list[[iter]] <- image
## Increase iterator
iter <- iter+1
}
}
## Save all plots
pdf(file = plotFName, width=11.5, height=8)
for(plot in umap_density_list) {
print(plot, newpage = TRUE)
}
graphics.off()
To understand how different regulons participate in different phases of the transition, we visualized the activity of 28 highly conserved regulons over time in each experimental condition.
### Setup
rm(list = ls())
library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(varhandle)
library(sRACIPE)
library(plyr)
library(ggplot2)
library(ggpmisc)
library(rowr)
library(spatstat.utils)
library(umap)
library(ggthemes)
## ggplot theme for graphics
ggtheme <- function (base_size = 11, base_family = "") {
theme_get() %+replace%
theme(axis.title = element_text(size = 16),
axis.text = element_text(size=14),
axis.line = element_line(color = "black", linetype = "solid"),
title = element_text(size = 16),
legend.title = element_text(size=16),
legend.title.align = 0,
legend.text = element_text(size=14),
legend.key.size = unit(20, units = "pt"),
panel.grid = element_blank(),
panel.background = element_blank())
}
# Multiple plot function from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
## Initialize some values
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
directions <- c("fwd","rev")
output_dir <- "output/cook/network_construction/dec1/"
data_dir <- "data/cook/"
scenic_data_dir <- "data/cook/SCENIC/results/"
network_dir <- "networks/vivekCode/cellLineNetworks/"
working_directory <- getwd()
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#E84855")
## Get reg list
reg_list <- c("BHLHE40(+)", "CEBPB(+)", "CREB3(+)", "CREM(+)", "ELK3(+)",
'ESRRA(+)', 'ETS1(+)', 'ETS2(+)', 'FOS(+)', 'FOSB(+)',
'FOSL1(+)', 'IRF1(+)', 'IRF2(+)', 'IRF3(+)', 'IRF7(+)',
'JUN(+)', 'JUNB(+)', 'KLF6(+)', 'MAFG(+)', 'MYC(+)',
'NFE2L3(+)', 'NFKB2(+)', 'RELB(+)', 'SOX4(+)', 'SPDEF(+)',
'STAT1(+)', 'STAT3(+)', 'XBP1(+)')
gene_list <- reg_list
## make dummy plot for null cases
dummy_df <- data.frame(x=1,y=1)
dummy_plot <- ggplot(data = dummy_df) +
geom_point(aes(x=x, y=y, alpha=0.0001)) +
guides(alpha=FALSE) +
ggtheme() +
geom_text(mapping = aes(x=1, y=1, label="NULL"))
for(g in seq_along(gene_list)) {
gene_plotlist <- list()
iter <- 1
for(c in cancer_types) {
for(t in treatments) {
gene_df_full <- data.frame(Activity=numeric(), Time=character(),Direction=character())
for(d in directions) {
## Read auc matrix, seurat object, de regs
treatmentName <- paste0(c,"_",t,"_",d)
seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
auc_mtx <- GetAssayData(seurat_regs, slot = "scale.data")
## Select gene of interest
if(gene_list[g] %in% rownames(auc_mtx)) {
gene_df <- as.data.frame(auc_mtx[gene_list[g],])
colnames(gene_df)[1] <- "Activity"
gene_df$Time <- FetchData(seurat_regs, "Time")[,"Time"]
gene_df$Direction <- d
gene_df_full <- rbind(gene_df_full, gene_df)
}
}
## Only proceed if there is data for the regulon - also correct color scheme if data for one direction is missing
if(nrow(gene_df_full) > 0) {
gene_df_full$Time <- factor(gene_df_full$Time, levels = c("0d", "8h", "1d", "3d", "7d", "8h_rm", "1d_rm", "3d_rm"))
if(length(unique(gene_df_full$Direction)) == 1) {
if(unique(gene_df_full$Direction) == "fwd") {
plotColors <- cbPalette
} else {
plotColors <- cbPalette[5:8]
}
} else {
plotColors <- cbPalette
}
## Plot
image <- ggplot(gene_df_full, aes(x=Time,y=Activity, fill=Time)) +
geom_boxplot() +
ggtheme() +
ylab(paste0(gene_list[g])) +
scale_fill_manual(values=plotColors) +
guides(fill=FALSE) +
facet_wrap(~Direction, scales = "free") +
theme(axis.text.y = element_text(size = 14),
title = element_text(size = 20),
axis.title.y = element_text(size = 16),
legend.text = element_text(size = 12),
legend.title = element_text(size = 14),
strip.text = element_text(size = 16),
axis.text.x = element_blank(),
axis.title.x = element_blank()) +
ggtitle(paste0(c," ", t))
if(t == "TNF") {
image <- image +
theme(axis.text.x = element_text(size = 16, angle = 90),
axis.title.x = element_text(size=20))
}
gene_plotlist[[iter]] <- image
iter <- iter+1
}
else {
gene_plotlist[[iter]] <- dummy_plot
iter <- iter+1
}
}
}
## Save plot for this regulon
title <- paste0("figs/fig3/",gene_list[g],"_time_activity.pdf")
pdf(file = title, width = 22, height = 17)
multiplot(plotlist = gene_plotlist, cols = 4)
dev.off()
}
Using three specific combinations of regulons known to be highly involved in the propagation of the EMT-inducing signals, specifically 1) RELB+NFKB2 vs FOSL1+JUNB, 2) FOSL1 vs RELB, and 3) JUNB vs RELB, we then visualized the average activity of these TFs over time for each condition. As in the earlier UMAP density plots, the regulon activity data were scaled on a linear model such that the
### Setup
rm(list = ls())
library(Seurat)
library(jsonlite)
library(sjmisc)
library(stringr)
library(varhandle)
library(sRACIPE)
library(plyr)
library(ggplot2)
library(ggpmisc)
library(rowr)
library(spatstat.utils)
library(umap)
library(ggthemes)
library(RColorBrewer)
## ggplot theme for graphics
ggtheme <- function (base_size = 11, base_family = "") {
theme_get() %+replace%
theme(axis.title = element_text(size = 18),
axis.text = element_text(size=16),
axis.line = element_line(color = "black", linetype = "solid"),
title = element_text(size = 20),
legend.title = element_text(size=18),
legend.title.align = 0,
legend.text = element_text(size=16),
legend.key.size = unit(20, units = "pt"),
panel.grid = element_blank(),
panel.background = element_blank())
}
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
## Initialize some values
treatments <- c("EGF","TGFB1", "TNF")
cancer_types <- c("A549","DU145","MCF7","OVCA420")
directions <- c("fwd","rev")
output_dir <- "output/cook/network_construction/dec1/"
data_dir <- "data/cook/"
scenic_data_dir <- "data/cook/SCENIC/results/"
network_dir <- "networks/vivekCode/cellLineNetworks/"
working_directory <- getwd()
cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#E84855")
## Parameters
genepair_lists <- list(c("RELB+NFKB2","FOSL1+JUNB"),c("FOSL1","RELB"),c("JUNB","RELB"))
## make dummy plot for null cases
dummy_df <- data.frame(x=1,y=1)
dummy_plot <- ggplot(data = dummy_df) +
geom_point(aes(x=x, y=y, alpha=0.0001)) +
guides(alpha=FALSE) +
ggtheme() +
geom_text(mapping = aes(x=1, y=1, label="NULL"))
for(genes in genepair_lists) {
trans_centroid_plotlist <- list()
iter <- 1
g1_expanded <- strsplit(genes[1],"\\+")[[1]]
g2_expanded <- strsplit(genes[2],"\\+")[[1]]
full_genelist <- c(g1_expanded, g2_expanded)
regs_g1 <-paste0(g1_expanded, "(+)")
regs_g2 <-paste0(g2_expanded, "(+)")
regs <- paste0(full_genelist, "(+)")
## Loop thru cancer types and treatments (12 conditions)
for(c in cancer_types) {
for(t in treatments) {
activity_df_full <- data.frame(x=numeric(),y=numeric(),Time=character(),Direction=character())
# Combine data for fwd and rev
for(d in directions) {
seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",d,".Rds"))
if(length(regs[which(regs %in% rownames(seurat_regs))]) == length(regs)) {
auc_mtx <- as.data.frame(t(GetAssayData(seurat_regs, slot = "scale.data")[regs,]))
if(length(regs) == 2) {
colnames(auc_mtx) <- c("x","y")
auc_mtx$Time <- FetchData(seurat_regs, "Time")[,"Time"]
auc_mtx$Direction <- d
}
else if(length(regs) == 4){
auc_mtx$x <- rowSums(auc_mtx[,c(regs_g1)], dims = 1)
auc_mtx$y <- rowSums(auc_mtx[,c(regs_g2)], dims = 1)
auc_mtx <- auc_mtx[,c("x","y")]
auc_mtx$Time <- FetchData(seurat_regs, "Time")[,"Time"]
auc_mtx$Direction <- d
}
else {
auc_mtx <- data.frame()
}
} else if(c == "MCF7" & t == "EGF") {
## Handle an exception case where data is missing for one regulon
auc_mtx <- as.data.frame(t(GetAssayData(seurat_regs, slot = "scale.data")[c("RELB(+)",regs_g2),]))
auc_mtx$x <- auc_mtx$`RELB(+)`
auc_mtx$y <- rowSums(auc_mtx[,c(regs_g2)], dims = 1)
auc_mtx <- auc_mtx[,c("x","y")]
auc_mtx$Time <- FetchData(seurat_regs, "Time")[,"Time"]
auc_mtx$Direction <- d
}
else {
auc_mtx <- data.frame()
}
activity_df_full <- rbind(activity_df_full, auc_mtx)
}
## If data for both directions is present, scale rev data to match fwd
if(nrow(activity_df_full) > 0 & length(unique(activity_df_full$Direction)) == 2) {
## Set colors
activity_df_full$Time <- factor(activity_df_full$Time, levels = c("0d", "8h", "1d", "3d", "7d", "8h_rm", "1d_rm", "3d_rm"))
plotColors <- cbPalette
#### Scaled plots
if(length(unique(activity_df_full$Direction)) == 2) {
fwd_7d_x <- activity_df_full[which(activity_df_full$Direction == "fwd" & activity_df_full$Time == "7d"),"x"]
fwd_7d_y <- activity_df_full[which(activity_df_full$Direction == "fwd"& activity_df_full$Time == "7d"),"y"]
rev_7d_x <- activity_df_full[which(activity_df_full$Direction == "rev"& activity_df_full$Time == "7d"),"x"]
rev_7d_y <- activity_df_full[which(activity_df_full$Direction == "rev"& activity_df_full$Time == "7d"),"y"]
## Generate linear models
x_lm <- lm(fwd_7d_x ~ rev_7d_x)
y_lm <- lm(fwd_7d_y ~ rev_7d_y)
## Rescale rev data
new_rev_x <- (activity_df_full[which(activity_df_full$Direction == "rev" ),"x"] *
unname(x_lm$coefficients["rev_7d_x"])) +
unname(x_lm$coefficients["(Intercept)"])
new_rev_y <- (activity_df_full[which(activity_df_full$Direction == "rev" ),"y"] *
unname(y_lm$coefficients["rev_7d_y"])) +
unname(y_lm$coefficients["(Intercept)"])
## Add to df
activity_df_full$transformed_x <- activity_df_full$x
activity_df_full[which(activity_df_full$Direction == "rev"),"transformed_x"] <- new_rev_x
activity_df_full$transformed_y <- activity_df_full$y
activity_df_full[which(activity_df_full$Direction == "rev"),"transformed_y"] <- new_rev_y
}
## Make centroid df
centroid_df <- aggregate(activity_df_full[,c("transformed_x","transformed_y")], list(Time=activity_df_full$Time), mean)
## Plot centroids
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
plot_color <- rf(20)
image <- ggplot(centroid_df, aes(x=transformed_x,y=transformed_y)) +
geom_point(mapping=aes(size=1, color=Time)) +
geom_path(aes(alpha=0.5), arrow = arrow(), linetype="dashed") +
xlab(genes[1]) +
ylab(genes[2]) +
scale_size(range=c(1.75, 3)) +
scale_shape_manual(values=c(16,17,3,4,8,18,15,13)) +
guides(alpha=FALSE, size=FALSE, color=FALSE) +
scale_color_manual(values=plot_color) +
ggtheme() +
ggtitle(paste(c," ",t))
trans_centroid_plotlist[[iter]] <- image
iter <- iter+1
}
else {
trans_centroid_plotlist[[iter]] <- dummy_plot
iter <- iter+1
}
}
}
title <- paste0(output_dir,genes[1],"_",genes[2],"_activity_centroids_SCALED.pdf")
pdf(file = title, width = 22, height = 17)
multiplot(plotlist = trans_centroid_plotlist, cols = 4)
dev.off()
}
We used the DATF regulons to identify Tf-target pairs for all the experimental conditions.
allRegulons <- list()
for(tCounter in seq_along(treatments)){
for(cCounter in seq_along(cancer_types)){
for(dCounter in seq_along(direction)){
## Select a treatment to work with (or iterate thru t in a loop)
t <- treatments[tCounter]
c <- cancer_types[cCounter]
direc <- direction[dCounter]
treatmentName <- paste0(c,"_",t,"_",direc)
output_dir <- paste0("output/auc/")
regulon_fname <- paste0("EmtRds/SCENIC/results/regulons_",c,"_",t,"_",direc,".json")
regs <- read_json(regulon_fname, simplify = T)
network <- data.frame(Source = character(), Target = character())
for(src in seq_along(regs)){
targets <- unlist(regs[src])
networkTmp <- data.frame(Source = rep(names(regs)[src],length(targets)), Target = targets)
network <- rbind(network, networkTmp)
}
network$Dataset <- treatmentName
allRegulons[[treatmentName]] <- network
}
}
}
saveRDS(allRegulons, file = "allRegulons.rds")
First we defined a function to calculate mutual information matrix using the activity matrix.
library(infotheo)
#' @param data expression matrix. Genes in rows and samples/cells in columns
#' @param nbins number of bins
#' @value A mutual information matrix
calculateMI <- function(data = actMat){
data <- t(data)
nGenes <- dim(data)[2]
miMat <- matrix(0,nrow = nGenes,ncol = nGenes)
geneNames <- colnames(data)
rownames(miMat) <- geneNames
colnames(miMat) <- geneNames
data <- infotheo::discretize(data)
#i=1
for(i in 1:(nGenes-1))
{
for(j in (i+1):(nGenes))
{
gene1 <- geneNames[i]
gene2 <- geneNames[j]
miMat[gene1, gene2] <- infotheo::mutinformation(data[,gene1], data[,gene2], method = "mm")
miMat[gene2, gene1] <- miMat[gene1, gene2]
}
}
return(miMat)
}
Next, we identified the networks for each treatment using the regulons from SCENIC and added the mutual information and correlation for each interaction.
allRegulons <- readRDS("allRegulons.rds")
coreTf <- readRDS("coreTf.rds")
fullNetworkList <- list()
counter <- 1
for(tCounter in seq_along(treatments)){
for(cCounter in seq_along(cancer_types)){
for(dCounter in seq_along(direction)){
t <- treatments[tCounter]
c <- cancer_types[cCounter]
direc <- direction[dCounter]
treatmentName <- paste0(c,"_",t,"_",direc)
print(treatmentName)
tfInNetwork <- coreTf
tfInNetwork <- substr(tfInNetwork, 0, nchar(as.character(tfInNetwork)) -3)
geneNetwork <- data.frame(allRegulons[treatmentName])
colnames(geneNetwork) <- c("Source", "Target", "Dataset")
geneNetwork$Source <- substr(geneNetwork$Source, 0, nchar(as.character(geneNetwork$Source)) -3)
# Include any interactions where TFs from core network are source or targets
coreInteractions <- geneNetwork[which(geneNetwork$Source %in% tfInNetwork),]
coreInteractionsTmp <- geneNetwork[which(geneNetwork$Target %in% tfInNetwork),]
coreInteractions <- rbind(coreInteractions, coreInteractionsTmp)
colnames(coreInteractions) <- c("Source", "Target", "Dataset")
coreInteractions <- coreInteractions[!duplicated(coreInteractions),]
seurat_regs <- readRDS(paste0("output/auc/seurat_regs",c,"_",t,"_",direc,".Rds"))
networkGenes <- union(coreInteractions$Source, coreInteractions$Target)
tmp <- seurat_regs@assays$RNA@scale.data
regNames <- substr(rownames(tmp), 0, nchar(as.character(rownames(tmp))) -3)
networkGenes <- networkGenes[which(networkGenes %in% regNames)]
coreInteractions <- coreInteractions[which(coreInteractions$Source %in% networkGenes),]
coreInteractions <- coreInteractions[which(coreInteractions$Target %in% networkGenes),]
networkGenes <- union(coreInteractions$Source, coreInteractions$Target)
rownames(tmp) <- regNames
tmp <- tmp[which(rownames(tmp) %in% networkGenes),]
tmp <- t(tmp)
actMi <- calculateMI(actMat = tmp)
mi <- integer(length = length(coreInteractions$Source))
for(int in seq_along(coreInteractions$Source)){
mi[int] <- actMi[as.character(coreInteractions$Source[int]), as.character(coreInteractions$Target[int])]
}
coreInteractions$Mi <- mi
actCor <- cor(tmp, method = "s")
type <- integer(length = length(coreInteractions$Source))
for(int in seq_along(coreInteractions$Source)){
type[int] <- actCor[as.character(coreInteractions$Source[int]), as.character(coreInteractions$Target[int])]
}
coreInteractions$Cor <- type
type[type>0] <- 1
type[type<=0] <- 2
coreInteractions$Type <- type
coreInteractions$Dataset <- treatmentName
# }
fullNetworkList[[treatmentName]] <- coreInteractions
counter <- counter+1
}
}
}
saveRDS(fullNetworkList, file = "fullNetworkList_top23RegAllRegAllInt.rds")
srcGenes <- lapply(fullNetworkList, function(x)x$Source)
tgtGenes <- lapply(fullNetworkList, function(x)x$Target)
dataset <- lapply(fullNetworkList, function(x)x$Dataset)
types <- lapply(fullNetworkList, function(x)x$Type)
actCor <- lapply(fullNetworkList, function(x)x$Cor)
mi <- lapply(fullNetworkList, function(x)x$Mi)
cellLine <- str_match(unlist(dataset), "(.*?)_")[,2]
signal <- str_match(unlist(dataset), "_(.*?)_")[,2]
allNetworkFull <- data.frame(Source = unlist(srcGenes), Target = unlist(tgtGenes), Type = unlist(types), Dataset = unlist(dataset), Cor = unlist(actCor), Mi = unlist(mi), CellLine = cellLine,Signal = signal, stringsAsFactors = FALSE)
saveRDS(allNetworkFull, "allNetworkFullCoreMiTop23AllRegAllInt.rds")
We defined a function to filter out interactions and simulated the network.
library(sRACIPE)
library(stringr)
createNetworkMiValue <- function(tmpNetwork=tmpNetwork,posMi = posMi,
negMi = negMi, name = "A549", simulate = FALSE){
tmpNetwork <- tmpNetwork[order(tmpNetwork$Mi, decreasing = TRUE),]
tmpNetwork <- tmpNetwork[!duplicated(tmpNetwork[c("Source","Target","Type")]),]
posNetwork <- tmpNetwork[tmpNetwork$Cor >0,]
posNetwork <- posNetwork[which(posNetwork$Mi > posMi),]
negNetwork <- tmpNetwork[tmpNetwork$Cor < 0,]
negNetwork <- negNetwork[which(negNetwork$Mi > negMi),]
if(dim(posNetwork)[1] == 0 | dim(negNetwork)[1] == 0) return()
tmpNetwork <- rbind(posNetwork,negNetwork)
sum(duplicated(tmpNetwork[,c("Source","Target")]))
sum(duplicated(tmpNetwork[,c("Source","Target","Type")]))
tmpNetwork <- tmpNetwork[!duplicated(tmpNetwork[,c("Source","Target","Type")]),]
tmpNetwork[duplicated(tmpNetwork[,c("Source","Target")]),]
tmpNetwork[duplicated(tmpNetwork[,c("Source","Target")], fromLast = TRUE),]
delRows <- anyDuplicated(tmpNetwork[,c("Source","Target")])
delRows <- c(delRows, anyDuplicated(tmpNetwork[,c("Source","Target")], fromLast = TRUE))
delRows <- delRows[delRows>0]
if(length(delRows>0)){ tmpNetwork <- tmpNetwork[-delRows,]}
interactionRemoved = length(tmpNetwork$Source)
# If signal nodes are to be removed iteratively, uncomment the next line
# while(interactionRemoved>0)
{
tmpVar = length(tmpNetwork$Source)
tmpNetwork <- tmpNetwork[(which(tmpNetwork$Source %in% tmpNetwork$Target )),]
# tmpNetwork <- tmpNetwork[which(tmpNetwork$Target %in% tmpNetwork$Source),]
interactionRemoved = tmpVar - length(tmpNetwork$Source)
}
tmpNetwork <- tmpNetwork[!duplicated(tmpNetwork),]
message(length(tmpNetwork$Source))
networkTfs <- union(tmpNetwork$Source,tmpNetwork$Target)
coreTfGenes <- substr(coreTf,0,nchar(coreTf)-3)
print(networkTfs[which(networkTfs %in% coreTfGenes)])
if(length(union(tmpNetwork$Source,tmpNetwork$Target)) <5) simulate = FALSE
# if(length(which(tmpNetwork$Type ==2)) <2) simulate = FALSE
# if(length(which(tmpNetwork$Type ==1)) <2) simulate = FALSE
if(simulate){
tmp <- sracipeSimulate(circuit = tmpNetwork[,c("Source", "Target", "Type")], plotToFile = TRUE,
numModels = 2000, plots = FALSE, stepper = "RK4", integrateStepSize = 0.02)
annotation(tmp) <- paste0(name,"_",as.character(posMi), "_",as.character(negMi),"Interactions")
# tmp <- sracipePlotData(tmp, plotToFile = T)
saveRDS(tmp, file = paste0("simAll/",name,"_",as.character(posMi), "_",as.character(negMi),".rds"))
}
# return(tmpNetwork)
}
The simulated data can be plotted and visualized.
tmp <- readRDS(paste0("simAll/","A549_TNF_0.2_0.3.rds"))
tmp <- sracipePlotData(tmp)
Next we varied the positive and negative mutual information cutoffs to simulate a large number of networks.
coreTf <- readRDS(("coreTf.rds"))
allNetworkFull <- readRDS("allNetworkFullCoreMiTop23AllRegAllInt.rds")
for(posMi in seq(1,0.05,-0.05)){
for(negMi in seq(0.5,0.05,-0.05)){
cellLine = "DU145"
treatment <- "TNF"
print(paste0(cellLine,"_",treatment,"_",as.character(posMi),"_",as.character(negMi)))
tmpNetwork <- allNetworkFull[(allNetworkFull$CellLine == cellLine & allNetworkFull$Signal == treatment),]
createNetworkMiValue(tmpNetwork,posMi=posMi,negMi = negMi,
name=paste0(cellLine,"_",treatment), simulate = TRUE)
}
}
library(igraph)
options(stringsAsFactors = FALSE)
allNetworkFull <- readRDS("allNetworkFullCoreMiTop23AllRegAllInt.rds")
allGenes <- union(allNetworkFull$Source,allNetworkFull$Target)
degs <- readRDS(file = "output/network_construction/degRegsFwd0dRev7d.rds")
degs$Signal <- str_match(degs$Comparison, "_(.*?)_")[,2]
counter <- 1
treatmentNames <- c()
for(tCounter in seq_along(treatments)){
for(cCounter in seq_along(cancer_types)){
for(dCounter in seq_along(direction)){
t <- treatments[tCounter]
c <- cancer_types[cCounter]
direc <- direction[dCounter]
treatmentNames <- c(treatmentNames,paste0(c,"_",t,"_",direc))
}
}
}
EMClass <- matrix(data = NA,ncol = length(treatmentNames),nrow = length(allGenes),dimnames = list(allGenes, treatmentNames))
for(tCounter in seq_along(treatmentNames)){
# tCounter = 1
degTmp <- degs[which((degs$Comparison == treatmentNames[tCounter]) & (degs$Time == "7d") & (degs$p_val_adj < 0.05)),]
Mgenes <- degTmp$gene[which(degTmp$avg_logFC>0)]
Mgenes <- substr(Mgenes,0,nchar(Mgenes)-3)
Mgenes <- Mgenes[which(Mgenes %in% allGenes)]
Egenes <- degTmp$gene[which(degTmp$avg_logFC<0)]
Egenes <- substr(Egenes,0,nchar(Egenes)-3)
Egenes <- Egenes[which(Egenes %in% allGenes)]
EMClass[Egenes,treatmentNames[tCounter]] <- FALSE
EMClass[Mgenes,treatmentNames[tCounter]] <- TRUE
degTmp <- degs[which((degs$Comparison == treatmentNames[tCounter]) & (degs$Time == "3d_rm") & (degs$p_val_adj < 0.05)),]
Mgenes <- degTmp$gene[which(degTmp$avg_logFC<0)]
Mgenes <- substr(Mgenes,0,nchar(Mgenes)-3)
Mgenes <- Mgenes[which(Mgenes %in% allGenes)]
Egenes <- degTmp$gene[which(degTmp$avg_logFC>0)]
Egenes <- substr(Egenes,0,nchar(Egenes)-3)
Egenes <- Egenes[which(Egenes %in% allGenes)]
EMClass[Egenes,treatmentNames[tCounter]] <- FALSE
EMClass[Mgenes,treatmentNames[tCounter]] <- TRUE
}
saveRDS(EMClass, file = "EMClassP05.rds")
coreGenes <- substr(coreTf, 0, nchar(coreTf)-3)
counter <- 1
conditionNames <- c()
for(tCounter in seq_along(treatments)){
for(cCounter in seq_along(cancer_types)){
t <- treatments[tCounter]
c <- cancer_types[cCounter]
conditionNames <- c(conditionNames,paste0(c,"_",t))
}
}
EMClassCondition <- matrix(data = NA,ncol = length(conditionNames),nrow = length(allGenes),dimnames = list(allGenes, conditionNames))
for(tCounter in seq_along(conditionNames)){
for(geneCounter in seq_along(allGenes)){
if(is.na(EMClass[allGenes[geneCounter],2*tCounter-1])){
if(!is.na(EMClass[allGenes[geneCounter],2*tCounter])){
EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- EMClass[allGenes[geneCounter],2*tCounter]
}
}
if(!is.na(EMClass[allGenes[geneCounter],2*tCounter-1])){
if(is.na(EMClass[allGenes[geneCounter],2*tCounter])){
EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- EMClass[allGenes[geneCounter],2*tCounter-1]
}
if(!is.na(EMClass[allGenes[geneCounter],2*tCounter])){
if(EMClass[allGenes[geneCounter],2*tCounter-1] == EMClass[allGenes[geneCounter],2*tCounter]){
EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- EMClass[allGenes[geneCounter],2*tCounter-1]}
if(!(EMClass[allGenes[geneCounter],2*tCounter-1] == EMClass[allGenes[geneCounter],2*tCounter])){
EMClassCondition[allGenes[geneCounter],conditionNames[tCounter]] <- NA}
}
}
}
}
saveRDS(EMClassCondition, "EMClassCondition.rds")
EMClassCondition <- readRDS("EMClassCondition.rds")
fileNames <- list.files("simAll/")
networkMetrics <- data.frame(Nodes=integer(),Interactions=integer(),PosInt = integer(),EModels = integer(), MModels = integer(),Connected = logical(),MGenes = integer(), Egenes = integer(), ModelCount = integer(), CellLine = character(), Signal = character(), Dataset = character(), transitivity = numeric(), MeanDistance = numeric(), Accuracy = numeric(), Filename = character(), MiPos = numeric(), MiNeg = numeric())
hammingTable <- c(0,7,12,17,22,28,35,42,49,56,64,71,79)
for(fileCounter in seq_along(fileNames)){
fileNameTmp <- fileNames[fileCounter]
print(fileNameTmp)
cellLine <- str_match(fileNameTmp, "(.*?)_")[,2]
signal <- str_match(fileNameTmp, "_(.*?)_")[,2]
posMi <- as.numeric(gsub("^(?:[^_]+_){2}([^_]+).*", "\\1", fileNameTmp))
negMi <- gsub("^(?:[^_]+_){3}([^_]+).*", "\\1", fileNameTmp)
negMi <- as.numeric(substr(negMi,0,nchar(negMi)-4))
condition <- paste0(cellLine, "_",signal)
racipe <- readRDS(paste0("simAll/",fileNameTmp))
# sracipePlotCircuit(racipe,plotToFile = FALSE)
racipe <- sracipeNormalize(racipe)
expData <- assay(racipe,1)
EMGenes <- EMClassCondition[,condition]
MState <- EMGenes[!is.na(EMGenes)]
circuit <- as.data.frame(sracipeCircuit(racipe))
g <- graph_from_data_frame(circuit, directed = TRUE, vertices = NULL)
commonGenes <- rownames(expData)
commonGenes <- commonGenes[which(commonGenes %in% names(MState))]
expData <- expData[commonGenes,]
expData <- Binarize::binarizeMatrix(expData, method = "kMeans")[,1:dim(expData)[2]]
MState <- MState[commonGenes]
EState <- !MState
expDataTmp <- as.matrix(expData)
storage.mode(expDataTmp) <- "logical"
if(length(MState)>85) message("Extend hamming table cutoffs")
hammingCutoff <- findInterval(length(MState),hammingTable)
hamDistE <- apply((expDataTmp), 2, function(x) sum(x != EState))
hamDistM <- apply((expDataTmp), 2, function(x) sum(x != MState))
networkMetricsTmp <- data.frame(Nodes= length(union(circuit$Source,circuit$Target)),Interactions=length(circuit$Source),PosInt = length(which(circuit$Type ==1)),EModels = length(which(hamDistE < hammingCutoff)), MModels = length(which(hamDistM < hammingCutoff)),Connected = igraph::is_connected(g),MGenes = sum(MState), Egenes = sum(EState), ModelCount = sracipeConfig(racipe)$simParams["numModels"], CellLine = cellLine, Signal = signal, Dataset = condition, Transitivity = igraph::transitivity(g),MeanDistance = igraph::mean_distance(g, directed = TRUE, unconnected = FALSE), Filename=fileNameTmp, MiPos = posMi, MiNeg = negMi)
networkMetricsTmp$Accuracy <- sum(networkMetricsTmp$EModels + networkMetricsTmp$MModels)/networkMetricsTmp$ModelCount
rownames(networkMetricsTmp) <- condition
networkMetrics <- rbind(networkMetrics, networkMetricsTmp)
}
saveRDS(networkMetrics, file = "networkMetrics.rds")
The network metrics can be visualized using ggplot.
networkMetrics <- networkMetrics[networkMetrics$Connected,]
networkMetrics <- networkMetrics[which((networkMetrics$MGenes + networkMetrics$Egenes)>4),]
ggplot(networkMetrics) +
geom_point(aes(x=Nodes,y=Accuracy), alpha=0.5, size = 0.5) +
theme_bw()+
xlab("Number of TFs") +
# facet_wrap(~Dataset, ncol = 3) +
theme(text=element_text(size=20),
axis.text.x=element_text(angle=90, hjust=1))
ggplot(networkMetrics) +
geom_point(aes(x=Interactions,y=Accuracy), alpha=0.5, size = 0.5) +
theme_bw()+
xlab("Number of Interactions") +
# facet_wrap(~Dataset, ncol = 3) +
theme(text=element_text(size=20),
axis.text.x=element_text(angle=90, hjust=1))
plotData <- networkMetrics
plotData$EMNodes <- (networkMetrics$MGenes + networkMetrics$Egenes)/networkMetrics$Nodes
ggplot(plotData) +
geom_box(aes(x=EMNodes ,y=Accuracy), alpha=0.5, size = 0.5) +
theme_bw()+
xlab("Fraction of EM TFs") +
# facet_wrap(~Dataset, ncol = 3) +
theme(text=element_text(size=20),
axis.text.x=element_text(angle=90, hjust=1))
plotData <- networkMetrics
plotData <- plotData[plotData$Connected == TRUE,]
plotData$DiffFraction <- (plotData$Interactions - plotData$PosInt)/plotData$Interactions
ggplot(plotData, aes(x=Interactions,y=Accuracy)) +
geom_point() +
xlab("Number of Interactions") +
theme_bw() +
theme(text=element_text(size=20)
)
ggplot(plotData, aes(x=Interactions,y=Accuracy)) +
geom_point() +
xlab("Number of Interactions") +
theme_bw() +
theme(text=element_text(size=20)
)
plotData$EMFraction <- (plotData$Egenes+ plotData$MGenes)/plotData$Nodes
ggplot(plotData, aes(x=plotData$EMFraction,y=Accuracy)) +
geom_point() +
xlab("Fraction of EM TFs") +
theme_bw() +
theme(text=element_text(size=20)
)
ggplot(plotData, aes(x=plotData$EMFraction,y=Accuracy)) +
geom_point() +
xlab("Fraction of EM TFs") +
theme_bw() +
facet_wrap(~Dataset) +
theme(text=element_text(size=20)
)
ggplot(plotData, aes(x=EMFraction,y=Accuracy)) +
geom_point() +
xlab("Fraction of EM TFs") +
theme_bw() +
facet_wrap(~Dataset) +
theme(text=element_text(size=20)
)
plotData$posIntFrac <- plotData$PosInt/plotData$Interactions
ggplot(plotData, aes(x=posIntFrac,y=Accuracy)) +
geom_point() +
xlab("Fraction of Excitatory Interactions") +
theme_bw() +
facet_wrap(~Dataset) +
theme(text=element_text(size=20)
)
plotData$NegIntFrac <- 1- plotData$posIntFrac
ggplot(plotData, aes(x=NegIntFrac,y=Accuracy)) +
geom_point() +
xlab("Fraction of Inhibitory Interactions") +
theme_bw() +
facet_wrap(~Dataset) +
theme(text=element_text(size=20)
)
ggplot(plotData, aes(x=plotData$EMFraction,y=Accuracy)) +
geom_point() +
xlab("Fraction of EM TFs") +
theme_bw() +
facet_wrap(~Dataset) +
theme(text=element_text(size=20)
)
We can plot the experimental and simulated datasets together to visualize whether the simulated data capture the TF activity derived from the experimental data. Here we show the results for the data from OVCA420 cell line with TGFB1 treatment and a network constructed using 0.4 and 0.1 mutual information cutoffs for excitatory and inhibitory interactions.
# Read the simulated data from the simulations above.
expRacipe <- readRDS(paste0("simAll/","OVCA420_TGFB1_0.4_0.1.rds"))
circuit <- sracipeCircuit(expRacipe)
networkTFs <- union(circuit$Source,circuit$Target)
expRacipeNorm <- sracipeNormalize(expRacipe)
expData <- assay(expRacipeNorm,1)
expDataOrig <- assay(expRacipe,1)
expDataOrig <- log2(1+expDataOrig)
expDataOrig <- expDataOrig[commonTfs,]
tmpMeans <- rowMeans(expDataOrig)
tmpSds <- apply(expDataOrig,1,sd)
seuratFwd <- readRDS(file = "output/auc/seurat_regsOVCA420_TGFB1_fwd.Rds")
actFwd <- seuratFwd@assays$RNA@scale.data
tfNames <- rownames(actFwd)
tfNames <- substr(tfNames, 0, nchar(tfNames)-3)
rownames(actFwd) <- substr(rownames(actFwd), 0, nchar(rownames(actFwd))-3)
commonTfs <- intersect(networkTFs,rownames(actFwd))
seuratRev <- readRDS(file = "output/auc/seurat_regsOVCA420_TGFB1_rev.Rds")
actRev <- seuratRev@assays$RNA@scale.data
tfNames <- rownames(actRev)
tfNames <- substr(tfNames, 0, nchar(tfNames)-3)
rownames(actRev) <- substr(rownames(actRev), 0, nchar(rownames(actRev))-3)
commonTfs <- intersect(commonTfs,rownames(actRev))
expData <- expData[commonTfs,]
actFwd <- actFwd[commonTfs,]
actRev <- actRev[commonTfs,]
plotColor <- c("#5E4FA2", "#4F61AA", "#4173B3", "#3386BC", "#4198B6",
"#51ABAE", "#62BEA6", "#77C8A4", "#8ED1A4", "#A4DAA4",
"#B8E2A1", "#CBEA9D", "#DEF199", "#EAF69F", "#F2FAAC",
"#FAFDB8", "#FEFAB6", "#FEF0A5", "#FEE695", "#FDD985",
"#FDC978", "#FDB96A", "#FCA75E", "#F99254", "#F67D4A",
"#F26943", "#E85A47", "#DE4B4B", "#D33C4E", "#C1284A",
"#AF1446", "#9E0142")
pca = prcomp(t(expData), scale. = T, center = T)
pcaData <- pca$x
pcaDataFwd <- (scale(t(actFwd), pca$center, pca$scale) %*%
pca$rotation)
pcaDataFwd <- data.frame(PC1=pcaDataFwd[,1],PC2=pcaDataFwd[,2], Time = seuratFwd$Time)
pcaDataRev <- (scale(t(actRev), pca$center, pca$scale) %*%
pca$rotation)
pcaDataRev <- data.frame(PC1=pcaDataRev[,1],PC2=pcaDataRev[,2], Time = seuratRev$Time)
expDataSim <- data.frame(pca$x[,1:2])
expDataSim$Time <- "Sim"
pcaDataFwd$Dir <- "Fwd"
pcaDataRev$Dir <- "Rev"
expDataSim$Dir <- "None"
# Fit a linear model to activities from cells from day 7 from forward and
# reverse directions.
sevenD <- rownames(pcaDataFwd[which(pcaDataFwd$Time == "7d"),])
sevenDFwd <- pcaDataFwd[sevenD,]$PC1
sevenDRev <- pcaDataRev[sevenD,]$PC1
pred <- pcaDataRev$PC1
pcaDataRevScaled <- data.frame(PC1 = predict(lm(sevenDFwd ~ sevenDRev),data.frame(sevenDRev = pred)))
sevenDFwd <- pcaDataFwd[sevenD,]$PC2
sevenDRev <- pcaDataRev[sevenD,]$PC2
pred <- pcaDataRev$PC2
pcaDataRevScaled$PC2 <- predict(lm(sevenDFwd ~ sevenDRev),data.frame(sevenDRev = pred))
pcaDataRevScaled$Time <- pcaDataRev$Time
pcaDataRevScaled$Dir <- pcaDataRev$Dir
allData <- rbind(expDataSim, pcaDataFwd,pcaDataRevScaled)
allData <- rbind(pcaDataFwd,pcaDataRevScaled)
allData <- (pcaDataRevScaled)
# allData <- (pcaDataRev)
# allData <- pcaDataFwd
allData <- expDataSim
p <- ggplot2::ggplot(allData) +
# geom_point(aes(x = allData[,1], y =allData[,2], color = Time, shape = Dir), alpha = 0.5) +
xlab(paste0("PC1(",100*summary(pca)$importance[2,1],"%)")) +
ylab(paste0("PC2(",100*summary(pca)$importance[2,2],"%)")) +
# stat_summary(aes(x = allData[,1], y =allData[,2], group=Dir, color = Dir), fun.y=mean, geom="line") +
geom_point(aes(x = allData[,1], y =allData[,2], group=Dir, color = Time, shape = Dir), alpha = 0.5) +
theme_bw() +
theme(text = element_text(size=15)) +
xlim(-5,11) +
ylim(-5,4)
NULL
We can modify the parameters for selected models and simulate the circuit with modified parameters. Here, we increased the production rate of JUN and RELB and used the steady state values from unperturbed simulations as the initial conditions for new stochastic simulations. For more details, please see here.
library(sRACIPE)
library(ggplot2)
newRacipe <- readRDS("eModelsRacipe.rds")
tmp <- sracipeParams(newRacipe)
tmp[,"G_JUN"] <- 10000*tmp[,"G_JUN"]
tmp[,"G_RELB"] <- 10000*tmp[,"G_RELB"]
sracipeParams(newRacipe) <- tmp
tmp <- assay(newRacipe,1)
sracipeIC(newRacipe) <- (tmp)
set.seed(123)
newRacipe2_D05_T2000 <- sracipeSimulate(
newRacipe, numModels = 1543, initialNoise = 0.05,simDet = FALSE,
nNoise=0,stepper = "EM", plots = FALSE,integrateStepSize = 0.01,
genParams = FALSE,genIC = FALSE, simulationTime = 2000)
saveRDS(newRacipe2_D05_T2000, file = "eModelsRacipe_D05_T2000.rds")
# We can instead also record the state of the models at multiple time points,
# for example, here we record the state at every 0.02 time.
newRacipeTime <- sracipeSimulate(
newRacipe, numModels = 1543, initialNoise = 0.05,simDet = FALSE,
nNoise=0,stepper = "EM", plots = FALSE,integrateStepSize = 0.01,
genParams = FALSE,genIC = FALSE, simulationTime = 50, printInterval = 0.02, printStart = 0)
saveRDS(newRacipeTime, file = "eModelsRacipe_D05_T50_interval02.rds")