Source code for PyWGCNA.comparison

import math
import sys

import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import networkx as nx
from matplotlib import cm
from matplotlib.colors import ListedColormap
from matplotlib.lines import Line2D

# bcolors
import PyWGCNA

HEADER = "\033[95m"
OKBLUE = "\033[94m"
OKCYAN = "\033[96m"
OKGREEN = "\033[92m"
WARNING = "\033[93m"
FAIL = "\033[91m"
ENDC = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"


[docs]class Comparison:
    """
    A class used to compare PyWGCNA to another PyWGCNA or any gene marker table

    :param geneModules: gene modules of networks
    :type geneModules: dict
    :param jaccard_similarity: jaccard similarity of common genes between each modules
    :type jaccard_similarity: pandas dataframe
    :param P_value: P value of common genes between each modules
    :type P_value: pandas dataframe
    :param fraction: fraction of common genes between each modules
    :type fraction: pandas dataframe

    """

    def __init__(self, geneModules=None):
        self.geneModules = geneModules

        self.jaccard_similarity = None
        self.P_value = None
        self.fraction = None

[docs]    def calculateJaccardSimilarity(self):
        """
        Calculate jaccard similarity matrix along multiple networks

        :return: dataframe containing jaccard similarity between all modules in all PyWGCNA objects
        :rtype: pandas dataframe
        """
        num = 0
        names = []
        for network in self.geneModules.keys():
            num = num + len(self.geneModules[network].moduleColors.unique())
            tmp = [f"{network}:" + s for s in self.geneModules[network].moduleColors.unique().tolist()]
            names = names + tmp
        jaccard_similarity = pd.DataFrame(0.0, columns=names, index=names)

        for network1 in self.geneModules.keys():
            for network2 in self.geneModules.keys():
                if network1 != network2:
                    modules1 = self.geneModules[network1].moduleColors.unique().tolist()
                    modules2 = self.geneModules[network2].moduleColors.unique().tolist()
                    for module1 in modules1:
                        for module2 in modules2:
                            list1 = self.geneModules[network1].index[
                                self.geneModules[network1].moduleColors == module1].tolist()
                            list2 = self.geneModules[network2].index[
                                self.geneModules[network2].moduleColors == module2].tolist()
                            jaccard_similarity.loc[
                                f"{network1}:{module1}", f"{network2}:{module2}"] = PyWGCNA.Comparison.jaccard(list1,
                                                                                                               list2)
                else:
                    modules = self.geneModules[network1].moduleColors.unique().tolist()
                    for module in modules:
                        jaccard_similarity.loc[f"{network1}:{module}", f"{network1}:{module}"] = 1.0

        self.jaccard_similarity = jaccard_similarity

        return jaccard_similarity

[docs]    def calculateFraction(self):
        """
        Calculate common fraction along multiple networks

        :return: dataframe containing fraction between all modules in all netwroks
        :rtype: pandas dataframe
        """

        num = 0
        names = []
        for network in self.geneModules.keys():
            num = num + len(self.geneModules[network].moduleColors.unique())
            tmp = [f"{network}:" + s for s in self.geneModules[network].moduleColors.unique().tolist()]
            names = names + tmp
        fraction = pd.DataFrame(0, columns=names, index=names)

        for network1 in self.geneModules.keys():
            for network2 in self.geneModules.keys():
                if network1 != network2:
                    modules1 = self.geneModules[network1].moduleColors.unique().tolist()
                    modules2 = self.geneModules[network2].moduleColors.unique().tolist()
                    for module1 in modules1:
                        for module2 in modules2:
                            list1 = self.geneModules[network1].index[
                                self.geneModules[network1].moduleColors == module1].tolist()
                            list2 = self.geneModules[network2].index[
                                self.geneModules[network2].moduleColors == module2].tolist()
                            num = np.intersect1d(list1, list2)
                            fraction.loc[f"{network1}:{module1}", f"{network2}:{module2}"] = len(num) / len(list2) * 100
                else:
                    modules = self.geneModules[network1].moduleColors.unique().tolist()
                    for module in modules:
                        fraction.loc[f"{network1}:{module}", f"{network1}:{module}"] = 1.0
        self.fraction = fraction

        return fraction

[docs]    def calculatePvalue(self):
        """
        Calculate pvalue of fraction along multiple networks

        :return: dataframe containing pvalue between all modules in all netwroks
        :rtype: pandas dataframe
        """

        num = 0
        names = []
        for network in self.geneModules.keys():
            num = num + len(self.geneModules[network].moduleColors.unique())
            tmp = [f"{network}:" + s for s in self.geneModules[network].moduleColors.unique().tolist()]
            names = names + tmp
        pvalue = pd.DataFrame(0, columns=names, index=names)

        genes = []
        for network in self.geneModules.keys():
            genes = genes + self.geneModules[network].index.tolist()

        genes = list(set(genes))
        nGenes = len(genes)

        for network1 in self.geneModules.keys():
            for network2 in self.geneModules.keys():
                if network1 != network2:
                    modules1 = self.geneModules[network1].moduleColors.unique().tolist()
                    modules2 = self.geneModules[network2].moduleColors.unique().tolist()
                    for module1 in modules1:
                        for module2 in modules2:
                            list1 = self.geneModules[network1].index[
                                self.geneModules[network1].moduleColors == module1].tolist()
                            list2 = self.geneModules[network2].index[
                                self.geneModules[network2].moduleColors == module2].tolist()
                            number = self.fraction.loc[f"{network1}:{module1}", f"{network2}:{module2}"] * len(
                                list2) / 100
                            table = np.array(
                                [[nGenes - len(list1) - len(list2) + number, len(list1) - number],
                                 [len(list2) - number, number]])
                            oddsr, p = fisher_exact(table, alternative='two-sided')
                            pvalue.loc[f"{network1}:{module1}", f"{network2}:{module2}"] = p
        self.P_value = pvalue

        return pvalue

[docs]    def compareNetworks(self):
        """
        compare Networks
        """
        self.calculateJaccardSimilarity()
        self.calculateFraction()
        self.calculatePvalue()

[docs]    def plotHeatmapComparison(self,
                              color="jaccard_similarity",
                              row_cluster=True,
                              col_cluster=True,
                              save=True,
                              plot_show=True,
                              plot_format="pdf",
                              file_name="heatmap_comparison"):
        """
        plot heatmap comparison

        :param color: how to color heatmap (options: jaccard_similarity or fraction) default: jaccard_similarity
        :type color: str
        :param row_cluster: If True, cluster the rows. (default True)
        :type row_cluster: bool
        :param col_cluster: If True, cluster the columns. (default True)
        :type col_cluster: bool
        :param save: if you want to save plot as comparison.png near to your script
        :type save: bool
        :param plot_show: indicate if you want to show the plot or not (default: True)
        :type plot_show: bool
        :param plot_format: indicate the format of plot (default: pdf)
        :type plot_format: str
        :param file_name: name and path of the plot use for save (default: heatmap_comparison)
        :type file_name: str

        """
        if color == "jaccard_similarity":
            tmp1 = self.jaccard_similarity
        elif color == "fraction":
            tmp1 = self.fraction
        else:
            sys.exit("Color is not correct!")

        reds = cm.get_cmap('Reds', 256)
        newcolors = reds(np.linspace(0, 1, 256))
        white = np.array([255 / 256, 255 / 256, 255 / 256, 1])
        newcolors[:1, :] = white
        newcmp = ListedColormap(newcolors)

        np.fill_diagonal(tmp1.values, 0)
        labels = self.P_value.round(decimals=2)
        labels[tmp1 == 0] = ""
        labels = (np.asarray(["{0}".format(pvalue)
                              for pvalue in labels.values.flatten()])) \
            .reshape(self.jaccard_similarity.shape)

        sns.set(font_scale=1.5)
        res = sns.clustermap(tmp1, annot=labels, fmt="", cmap=newcmp,
                             row_cluster=row_cluster, col_cluster=col_cluster,
                             figsize=(tmp1.shape[0] + 3, tmp1.shape[1]),
                             annot_kws={'size': 20, "weight": "bold"})
        plt.setp(res.ax_heatmap.xaxis.get_majorticklabels(), fontsize=20, fontweight="bold", rotation=90)
        plt.setp(res.ax_heatmap.yaxis.get_majorticklabels(), fontsize=20, fontweight="bold")
        plt.yticks(rotation=0)
        res.fig.suptitle(f"comparison heatmap", fontsize=30, fontweight="bold")

        if save:
            res.savefig(f"{file_name}.{plot_format}")
        if plot_show:
            plt.show()
        else:
            plt.close()

[docs]    def plotJaccardSimilarity(self,
                              color=None,
                              cutoff=0.1,
                              figsize=None,
                              save=True,
                              plot_show=True,
                              plot_format="png",
                              file_name="jaccard_similarity"):
        """
        Plot jaccard similarity matrix as a network

        :param color: if you want to color nodes for each networks separately
        :type color: dict
        :param cutoff: threshold you used for filtering jaccard similarity
        :type cutoff: double
        :param figsize: indicate the size of plot (default is base on the number of nodes that pass cutoff)
        :type figsize: tuple of int
        :param save: indicate if you want to save the plot or not (default: True)
        :type save: bool
        :param plot_show: indicate if you want to show the plot or not (default: True)
        :type plot_show: bool
        :param plot_format: indicate the format of plot (default: png)
        :type plot_format: str
        :param file_name: name and path of the plot use for save (default: jaccard_similarity)
        :type file_name: str
        """
        df = self.jaccard_similarity
        np.fill_diagonal(df.values, 0)
        df = pd.DataFrame(df.stack())
        df.reset_index(inplace=True)
        df = df[df[0] >= cutoff]
        df.columns = ['source', 'dest', 'weight']
        if df.shape[0] == 0:
            print(f"{WARNING}None of the connections pass the cutoff{ENDC}")
            return None

        G = nx.from_pandas_edgelist(df, 'source', 'dest', 'weight')
        node_labels = {}
        nodes = list(G.nodes())
        for i in range(len(nodes)):
            node_labels[nodes[i]] = nodes[i].split(":")[1]
        edges = G.edges()
        weights = [G[u][v]['weight'] * 10 for u, v in edges]
        edge_labels = {}
        for u, v in edges:
            edge_labels[u, v] = str(round(G[u][v]['weight'], 2))

        color_map = []
        if color is None:
            color_map = None
        else:
            for node in G:
                color_map.append(color[node.split(":")[0]])

        if figsize is None:
            figsize = (len(G.nodes()) / 2, len(G.nodes()) / 2)
        fig, ax = plt.subplots(figsize=figsize, facecolor='white')
        pos = nx.spring_layout(G, k=1 / math.sqrt(len(G.nodes()) / 2))
        nx.draw_networkx(G,
                         pos=pos,
                         node_color=color_map,
                         width=weights,
                         labels=node_labels,
                         font_size=8,
                         node_size=500,
                         with_labels=True,
                         ax=ax)

        nx.draw_networkx_edge_labels(G,
                                     pos,
                                     edge_labels=edge_labels,
                                     font_size=7)

        if color is not None:
            for label in color:
                ax.plot([0], [0], color=color[label], label=label)

        plt.legend()
        plt.tight_layout()
        if save:
            plt.savefig(f"{file_name}.{plot_format}")
        if plot_show:
            plt.show()
        else:
            plt.close()

[docs]    def plotBubbleComparison(self,
                             bubble_size="jaccard_similarity",
                             cutoff=0.01,
                             color=None,
                             order1=None,
                             order2=None,
                             figsize=None,
                             save=True,
                             plot_show=True,
                             plot_format="png",
                             file_name="bubble_comparison"):
        """
        plot comparison matrix as a bubble plot

        :param bubble_size: which information you want to use for size of bubble (options: jaccard_similarity or fraction) default: jaccard_similarity
        :type bubble_size: str
        :param cutoff: threshold you used for defining significant comparison
        :type cutoff: double
        :param color: if you want to color tick labels for each networks separately
        :type color: dict
        :param order1: order of modules in PyWGCNA1 you want to show in plot (name of each elements should mapped the name of modules in your first PyWGCNA)
        :type order1: list of str
        :param order2: order of modules in PyWGCNA2 you want to show in plot (name of each elements should mapped the name of modules in your second PyWGCNA)
        :type order2: list of str
        :param figsize: indicate the size of plot (default is base on the number of modules)
        :type figsize: tuple of int
        :param save: if you want to save plot as comparison.png near to your script
        :type save: bool
        :param save: indicate if you want to save the plot or not (default: True)
        :type save: bool
        :param plot_show: indicate if you want to show the plot or not (default: True)
        :type plot_show: bool
        :param plot_format: indicate the format of plot (default: png)
        :type plot_format: str
        :param file_name: name and path of the plot use for save (default: jaccard_similarity)
        :type file_name: str
        """
        viridis = cm.get_cmap('viridis', 256)
        newcolors = viridis(np.linspace(0, 1, 256))
        grey = np.array([128 / 256, 128 / 256, 128 / 256, 1])
        newcolors[:round(256 * cutoff), :] = grey
        newcmp = ListedColormap(newcolors)

        P_value = self.P_value.copy(deep=True)
        P_value = -1 * np.log10(P_value.astype(np.float64))

        if bubble_size == "jaccard_similarity":
            size = self.jaccard_similarity.copy(deep=True)
        elif bubble_size == "fraction":
            size = self.fraction.copy(deep=True)
        else:
            sys.exit("Color is not correct!")
        np.fill_diagonal(size.values, 0)

        P_value[size == 0] = np.nan
        P_value.replace([np.inf], -1, inplace=True)
        P_value[P_value == -1] = P_value.max(numeric_only=True).max() + 1

        size[size == 0] = np.nan

        if order1 is not None:
            P_value = P_value.reindex(columns=order1)
            size = size.reindex(columns=order1)

        if order2 is not None:
            P_value = P_value.reindex(order2)
            size = size.reindex(order2)

        df = pd.DataFrame(size.stack())
        df.reset_index(inplace=True)
        df.columns = ['x', 'y', 'size']

        P_value = pd.DataFrame(P_value.stack())
        P_value.reset_index(inplace=True)
        df['-log10(P_value)'] = P_value[0].values.tolist()

        if figsize is None:
            figsize = (df.shape[0] / 70 + df.shape[0] / 400, df.shape[0] / 70)
        fig, ax = plt.subplots(figsize=figsize, facecolor='white')

        ax = sns.scatterplot(data=df,
                             x="x",
                             y="y",
                             hue="-log10(P_value)",
                             size="size",
                             palette=newcmp)

        norm = plt.Normalize(0, df['-log10(P_value)'].max())
        sm = plt.cm.ScalarMappable(cmap="viridis", norm=norm)
        sm.set_array([])

        fig.canvas.draw()

        if color is not None:
            xticks = []
            yticks = []
            for xtick, ytick in zip(ax.get_xticklabels(), ax.get_yticklabels()):
                tmp = xtick.get_text().split(":")
                xtick.set_color(color[tmp[0]])
                xticks.append(tmp[1])

                tmp = ytick.get_text().split(":")
                ytick.set_color(color[tmp[0]])
                yticks.append(tmp[1])

            ax.set_xticklabels(xticks, rotation=90)
            ax.set_yticklabels(yticks)
        else:
            ax.set_xticklabels(rotation=90)

        ax.set_xlabel('')
        ax.set_ylabel('')

        # Remove the legend and add a colorbar
        ax.get_legend().remove()
        ax.figure.colorbar(sm, shrink=0.25, label='-log10(P_value)')

        handles, labels = ax.get_legend_handles_labels()
        entries_to_skip = labels.index('size') + 1
        handles = handles[entries_to_skip:]
        labels = labels[entries_to_skip:]
        for h in handles[1:]:
            sizes = [s for s in h.get_sizes()]
            h.set_sizes(sizes)
        labels = labels[:1] + [f'{float(lab):.2f}' for lab in labels[1:]]
        legend_size = ax.legend(handles, labels,
                                title=bubble_size,
                                bbox_to_anchor=(1.04, 1),
                                loc=2,
                                borderaxespad=0.,
                                frameon=False)
        plt.gca().add_artist(legend_size)

        legend_elements = []
        if color is not None:
            for label in color:
                tmp = Line2D([0], [0], color=color[label], label=label)
                legend_elements.append(tmp)

            ax.legend(handles=legend_elements,
                      title='networks',
                      bbox_to_anchor=(1.04, 0),
                      loc=3,
                      borderaxespad=0.,
                      frameon=False)

        if save:
            plt.savefig(f"{file_name}.{plot_format}")
        if plot_show:
            plt.show()
        else:
            plt.close()

[docs]    @staticmethod
    def jaccard(list1, list2):
        """
        Calculate jaccard similarity matrix for two lists

        :param list1: first list containing the data
        :type list1: list
        :param list2: second list containing the data
        :type list2: list

        :return: jaccard similarity
        :rtype: double
        """
        intersection = len(list(set(list1).intersection(list2)))
        union = (len(list1) + len(list2)) - intersection
        return float(intersection) / union

[docs]    def saveComparison(self, name="comparison"):
        """
        save comparison object as comparison.p near to the script

        :param name: name of the pickle file (default: comparison.p)
        :type name: str
        
        """
        print(f"{BOLD}{OKBLUE}Saving comparison as {name}.p{ENDC}")

        picklefile = open(f"{name}.p", 'wb')
        pickle.dump(self, picklefile)
        picklefile.close()