Source code for PyWGCNA.comparison

import math
import sys

import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import networkx as nx
from matplotlib import cm
from matplotlib.colors import ListedColormap
from matplotlib.lines import Line2D

# bcolors
import PyWGCNA

HEADER = "\033[95m"
OKBLUE = "\033[94m"
OKCYAN = "\033[96m"
OKGREEN = "\033[92m"
WARNING = "\033[93m"
FAIL = "\033[91m"
ENDC = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"


[docs]class Comparison: """ A class used to compare PyWGCNA to another PyWGCNA or any gene marker table :param geneModules: gene modules of networks :type geneModules: dict :param jaccard_similarity: jaccard similarity of common genes between each modules :type jaccard_similarity: pandas dataframe :param P_value: P value of common genes between each modules :type P_value: pandas dataframe :param fraction: fraction of common genes between each modules :type fraction: pandas dataframe """ def __init__(self, geneModules=None): self.geneModules = geneModules self.jaccard_similarity = None self.P_value = None self.fraction = None
[docs] def calculateJaccardSimilarity(self): """ Calculate jaccard similarity matrix along multiple networks :return: dataframe containing jaccard similarity between all modules in all PyWGCNA objects :rtype: pandas dataframe """ num = 0 names = [] for network in self.geneModules.keys(): num = num + len(self.geneModules[network].moduleColors.unique()) tmp = [f"{network}:" + s for s in self.geneModules[network].moduleColors.unique().tolist()] names = names + tmp jaccard_similarity = pd.DataFrame(0.0, columns=names, index=names) for network1 in self.geneModules.keys(): for network2 in self.geneModules.keys(): if network1 != network2: modules1 = self.geneModules[network1].moduleColors.unique().tolist() modules2 = self.geneModules[network2].moduleColors.unique().tolist() for module1 in modules1: for module2 in modules2: list1 = self.geneModules[network1].index[ self.geneModules[network1].moduleColors == module1].tolist() list2 = self.geneModules[network2].index[ self.geneModules[network2].moduleColors == module2].tolist() jaccard_similarity.loc[ f"{network1}:{module1}", f"{network2}:{module2}"] = PyWGCNA.Comparison.jaccard(list1, list2) else: modules = self.geneModules[network1].moduleColors.unique().tolist() for module in modules: jaccard_similarity.loc[f"{network1}:{module}", f"{network1}:{module}"] = 1.0 self.jaccard_similarity = jaccard_similarity return jaccard_similarity
[docs] def calculateFraction(self): """ Calculate common fraction along multiple networks :return: dataframe containing fraction between all modules in all netwroks :rtype: pandas dataframe """ num = 0 names = [] for network in self.geneModules.keys(): num = num + len(self.geneModules[network].moduleColors.unique()) tmp = [f"{network}:" + s for s in self.geneModules[network].moduleColors.unique().tolist()] names = names + tmp fraction = pd.DataFrame(0, columns=names, index=names) for network1 in self.geneModules.keys(): for network2 in self.geneModules.keys(): if network1 != network2: modules1 = self.geneModules[network1].moduleColors.unique().tolist() modules2 = self.geneModules[network2].moduleColors.unique().tolist() for module1 in modules1: for module2 in modules2: list1 = self.geneModules[network1].index[ self.geneModules[network1].moduleColors == module1].tolist() list2 = self.geneModules[network2].index[ self.geneModules[network2].moduleColors == module2].tolist() num = np.intersect1d(list1, list2) fraction.loc[f"{network1}:{module1}", f"{network2}:{module2}"] = len(num) / len(list2) * 100 else: modules = self.geneModules[network1].moduleColors.unique().tolist() for module in modules: fraction.loc[f"{network1}:{module}", f"{network1}:{module}"] = 1.0 self.fraction = fraction return fraction
[docs] def calculatePvalue(self): """ Calculate pvalue of fraction along multiple networks :return: dataframe containing pvalue between all modules in all netwroks :rtype: pandas dataframe """ num = 0 names = [] for network in self.geneModules.keys(): num = num + len(self.geneModules[network].moduleColors.unique()) tmp = [f"{network}:" + s for s in self.geneModules[network].moduleColors.unique().tolist()] names = names + tmp pvalue = pd.DataFrame(0, columns=names, index=names) genes = [] for network in self.geneModules.keys(): genes = genes + self.geneModules[network].index.tolist() genes = list(set(genes)) nGenes = len(genes) for network1 in self.geneModules.keys(): for network2 in self.geneModules.keys(): if network1 != network2: modules1 = self.geneModules[network1].moduleColors.unique().tolist() modules2 = self.geneModules[network2].moduleColors.unique().tolist() for module1 in modules1: for module2 in modules2: list1 = self.geneModules[network1].index[ self.geneModules[network1].moduleColors == module1].tolist() list2 = self.geneModules[network2].index[ self.geneModules[network2].moduleColors == module2].tolist() number = self.fraction.loc[f"{network1}:{module1}", f"{network2}:{module2}"] * len( list2) / 100 table = np.array( [[nGenes - len(list1) - len(list2) + number, len(list1) - number], [len(list2) - number, number]]) oddsr, p = fisher_exact(table, alternative='two-sided') pvalue.loc[f"{network1}:{module1}", f"{network2}:{module2}"] = p self.P_value = pvalue return pvalue
[docs] def compareNetworks(self): """ compare Networks """ self.calculateJaccardSimilarity() self.calculateFraction() self.calculatePvalue()
[docs] def plotHeatmapComparison(self, color="jaccard_similarity", row_cluster=True, col_cluster=True, save=True, plot_show=True, plot_format="pdf", file_name="heatmap_comparison"): """ plot heatmap comparison :param color: how to color heatmap (options: jaccard_similarity or fraction) default: jaccard_similarity :type color: str :param row_cluster: If True, cluster the rows. (default True) :type row_cluster: bool :param col_cluster: If True, cluster the columns. (default True) :type col_cluster: bool :param save: if you want to save plot as comparison.png near to your script :type save: bool :param plot_show: indicate if you want to show the plot or not (default: True) :type plot_show: bool :param plot_format: indicate the format of plot (default: pdf) :type plot_format: str :param file_name: name and path of the plot use for save (default: heatmap_comparison) :type file_name: str """ if color == "jaccard_similarity": tmp1 = self.jaccard_similarity elif color == "fraction": tmp1 = self.fraction else: sys.exit("Color is not correct!") reds = cm.get_cmap('Reds', 256) newcolors = reds(np.linspace(0, 1, 256)) white = np.array([255 / 256, 255 / 256, 255 / 256, 1]) newcolors[:1, :] = white newcmp = ListedColormap(newcolors) np.fill_diagonal(tmp1.values, 0) labels = self.P_value.round(decimals=2) labels[tmp1 == 0] = "" labels = (np.asarray(["{0}".format(pvalue) for pvalue in labels.values.flatten()])) \ .reshape(self.jaccard_similarity.shape) sns.set(font_scale=1.5) res = sns.clustermap(tmp1, annot=labels, fmt="", cmap=newcmp, row_cluster=row_cluster, col_cluster=col_cluster, figsize=(tmp1.shape[0] + 3, tmp1.shape[1]), annot_kws={'size': 20, "weight": "bold"}) plt.setp(res.ax_heatmap.xaxis.get_majorticklabels(), fontsize=20, fontweight="bold", rotation=90) plt.setp(res.ax_heatmap.yaxis.get_majorticklabels(), fontsize=20, fontweight="bold") plt.yticks(rotation=0) res.fig.suptitle(f"comparison heatmap", fontsize=30, fontweight="bold") if save: res.savefig(f"{file_name}.{plot_format}") if plot_show: plt.show() else: plt.close()
[docs] def plotJaccardSimilarity(self, color=None, cutoff=0.1, figsize=None, save=True, plot_show=True, plot_format="png", file_name="jaccard_similarity"): """ Plot jaccard similarity matrix as a network :param color: if you want to color nodes for each networks separately :type color: dict :param cutoff: threshold you used for filtering jaccard similarity :type cutoff: double :param figsize: indicate the size of plot (default is base on the number of nodes that pass cutoff) :type figsize: tuple of int :param save: indicate if you want to save the plot or not (default: True) :type save: bool :param plot_show: indicate if you want to show the plot or not (default: True) :type plot_show: bool :param plot_format: indicate the format of plot (default: png) :type plot_format: str :param file_name: name and path of the plot use for save (default: jaccard_similarity) :type file_name: str """ df = self.jaccard_similarity np.fill_diagonal(df.values, 0) df = pd.DataFrame(df.stack()) df.reset_index(inplace=True) df = df[df[0] >= cutoff] df.columns = ['source', 'dest', 'weight'] if df.shape[0] == 0: print(f"{WARNING}None of the connections pass the cutoff{ENDC}") return None G = nx.from_pandas_edgelist(df, 'source', 'dest', 'weight') node_labels = {} nodes = list(G.nodes()) for i in range(len(nodes)): node_labels[nodes[i]] = nodes[i].split(":")[1] edges = G.edges() weights = [G[u][v]['weight'] * 10 for u, v in edges] edge_labels = {} for u, v in edges: edge_labels[u, v] = str(round(G[u][v]['weight'], 2)) color_map = [] if color is None: color_map = None else: for node in G: color_map.append(color[node.split(":")[0]]) if figsize is None: figsize = (len(G.nodes()) / 2, len(G.nodes()) / 2) fig, ax = plt.subplots(figsize=figsize, facecolor='white') pos = nx.spring_layout(G, k=1 / math.sqrt(len(G.nodes()) / 2)) nx.draw_networkx(G, pos=pos, node_color=color_map, width=weights, labels=node_labels, font_size=8, node_size=500, with_labels=True, ax=ax) nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=7) if color is not None: for label in color: ax.plot([0], [0], color=color[label], label=label) plt.legend() plt.tight_layout() if save: plt.savefig(f"{file_name}.{plot_format}") if plot_show: plt.show() else: plt.close()
[docs] def plotBubbleComparison(self, bubble_size="jaccard_similarity", cutoff=0.01, color=None, order1=None, order2=None, figsize=None, save=True, plot_show=True, plot_format="png", file_name="bubble_comparison"): """ plot comparison matrix as a bubble plot :param bubble_size: which information you want to use for size of bubble (options: jaccard_similarity or fraction) default: jaccard_similarity :type bubble_size: str :param cutoff: threshold you used for defining significant comparison :type cutoff: double :param color: if you want to color tick labels for each networks separately :type color: dict :param order1: order of modules in PyWGCNA1 you want to show in plot (name of each elements should mapped the name of modules in your first PyWGCNA) :type order1: list of str :param order2: order of modules in PyWGCNA2 you want to show in plot (name of each elements should mapped the name of modules in your second PyWGCNA) :type order2: list of str :param figsize: indicate the size of plot (default is base on the number of modules) :type figsize: tuple of int :param save: if you want to save plot as comparison.png near to your script :type save: bool :param save: indicate if you want to save the plot or not (default: True) :type save: bool :param plot_show: indicate if you want to show the plot or not (default: True) :type plot_show: bool :param plot_format: indicate the format of plot (default: png) :type plot_format: str :param file_name: name and path of the plot use for save (default: jaccard_similarity) :type file_name: str """ viridis = cm.get_cmap('viridis', 256) newcolors = viridis(np.linspace(0, 1, 256)) grey = np.array([128 / 256, 128 / 256, 128 / 256, 1]) newcolors[:round(256 * cutoff), :] = grey newcmp = ListedColormap(newcolors) P_value = self.P_value.copy(deep=True) P_value = -1 * np.log10(P_value.astype(np.float64)) if bubble_size == "jaccard_similarity": size = self.jaccard_similarity.copy(deep=True) elif bubble_size == "fraction": size = self.fraction.copy(deep=True) else: sys.exit("Color is not correct!") np.fill_diagonal(size.values, 0) P_value[size == 0] = np.nan P_value.replace([np.inf], -1, inplace=True) P_value[P_value == -1] = P_value.max(numeric_only=True).max() + 1 size[size == 0] = np.nan if order1 is not None: P_value = P_value.reindex(columns=order1) size = size.reindex(columns=order1) if order2 is not None: P_value = P_value.reindex(order2) size = size.reindex(order2) df = pd.DataFrame(size.stack()) df.reset_index(inplace=True) df.columns = ['x', 'y', 'size'] P_value = pd.DataFrame(P_value.stack()) P_value.reset_index(inplace=True) df['-log10(P_value)'] = P_value[0].values.tolist() if figsize is None: figsize = (df.shape[0] / 70 + df.shape[0] / 400, df.shape[0] / 70) fig, ax = plt.subplots(figsize=figsize, facecolor='white') ax = sns.scatterplot(data=df, x="x", y="y", hue="-log10(P_value)", size="size", palette=newcmp) norm = plt.Normalize(0, df['-log10(P_value)'].max()) sm = plt.cm.ScalarMappable(cmap="viridis", norm=norm) sm.set_array([]) fig.canvas.draw() if color is not None: xticks = [] yticks = [] for xtick, ytick in zip(ax.get_xticklabels(), ax.get_yticklabels()): tmp = xtick.get_text().split(":") xtick.set_color(color[tmp[0]]) xticks.append(tmp[1]) tmp = ytick.get_text().split(":") ytick.set_color(color[tmp[0]]) yticks.append(tmp[1]) ax.set_xticklabels(xticks, rotation=90) ax.set_yticklabels(yticks) else: ax.set_xticklabels(rotation=90) ax.set_xlabel('') ax.set_ylabel('') # Remove the legend and add a colorbar ax.get_legend().remove() ax.figure.colorbar(sm, shrink=0.25, label='-log10(P_value)') handles, labels = ax.get_legend_handles_labels() entries_to_skip = labels.index('size') + 1 handles = handles[entries_to_skip:] labels = labels[entries_to_skip:] for h in handles[1:]: sizes = [s for s in h.get_sizes()] h.set_sizes(sizes) labels = labels[:1] + [f'{float(lab):.2f}' for lab in labels[1:]] legend_size = ax.legend(handles, labels, title=bubble_size, bbox_to_anchor=(1.04, 1), loc=2, borderaxespad=0., frameon=False) plt.gca().add_artist(legend_size) legend_elements = [] if color is not None: for label in color: tmp = Line2D([0], [0], color=color[label], label=label) legend_elements.append(tmp) ax.legend(handles=legend_elements, title='networks', bbox_to_anchor=(1.04, 0), loc=3, borderaxespad=0., frameon=False) if save: plt.savefig(f"{file_name}.{plot_format}") if plot_show: plt.show() else: plt.close()
[docs] @staticmethod def jaccard(list1, list2): """ Calculate jaccard similarity matrix for two lists :param list1: first list containing the data :type list1: list :param list2: second list containing the data :type list2: list :return: jaccard similarity :rtype: double """ intersection = len(list(set(list1).intersection(list2))) union = (len(list1) + len(list2)) - intersection return float(intersection) / union
[docs] def saveComparison(self, name="comparison"): """ save comparison object as comparison.p near to the script :param name: name of the pickle file (default: comparison.p) :type name: str """ print(f"{BOLD}{OKBLUE}Saving comparison as {name}.p{ENDC}") picklefile = open(f"{name}.p", 'wb') pickle.dump(self, picklefile) picklefile.close()