Source code for pygna.diagnostic

import matplotlib.pyplot as plt
import seaborn as sns
import logging
import numpy as np
from matplotlib.offsetbox import AnchoredText
import networkx as nx


[docs]def plot_degree(degree_object: nx.Graph, output_file: str): """ Diagnosis tool for the degree object. It takes a graph and plot its statistics :param degree_object: the graph to plot :param output_file: the path to save the file Example _______ >>> graph = nx.complete_graph(100) >>> plot_degree(nx.degree(graph), "graph_degree") """ D = dict(degree_object) degrees = {k: v for k, v in D.items()} degree_values = np.array(list(degrees.values())) fig, axes = plt.subplots(1, figsize=(10, 10)) g1 = sns.distplot(degree_values, hist=True, ax=axes) key_max = max(degrees.keys(), key=(lambda k: degrees[k])) g1 = sns.distplot([degrees[key_max]], hist=False, kde=False, rug=True, color='r', ax=axes) axes.annotate('%s: %d' % (key_max, degrees[key_max]), xy=(degrees[key_max], 0), xytext=(degrees[key_max], axes.dataLim.y1 / 2), arrowprops=dict(arrowstyle="->")) g1 = sns.distplot([np.median(degree_values)], hist=False, kde=False, rug=True, color='r', ax=axes) axes.annotate('median %f' % np.median(degree_values), xy=(np.median(degree_values), 0), xytext=(np.median(degree_values), axes.dataLim.y1 / 2), arrowprops=dict(arrowstyle="->")) sns.despine(ax=axes, top=True, bottom=False, right=True, left=True) g1.set_ylabel("Density") g1.set_xlabel("Node Degree") if output_file.endswith('.pdf'): plt.savefig(output_file, format="pdf") elif output_file.endswith('.png'): plt.savefig(output_file, format="png") else: logging.warning('The null distribution figure can only be saved in pdf or png, forced to png') fig.savefig(output_file + '.png', format="png")
[docs]def plot_connected_components(c_components: nx.connected_components, output_file: str) -> None: """ Diagnosis tool for the connected components object. Creates the histogram of the components length, to analyse the relationship between the lcc and the other c_components, and prints some overall stats about the connected components :param c_components: the list of the connected components :param output_file: the path to save the file Example _______ >>> from pygna import diagnostic >>> import pygna.reading_class as rc >>> network = rc.ReadTsv("network_file.tsv").get_network() >>> geneset = rc.ReadGmt("geneset_input_file.gmt").get_geneset("brca") >>> c_components_figure_file = "components_plot.pdf" >>> for setname, item in geneset.items(): ... graph = nx.subgraph(network, item) ... diagnostic.plot_connected_components(nx.connected_components(graph), c_components_figure_file) """ c_components_len = [len(k) for k in c_components] logging.info("Number of cc %d" % (len(c_components_len))) logging.info("First five cc" + str(c_components_len[0:5])) logging.info("Mean length of cc %d" % (np.mean(c_components_len))) fig, axes = plt.subplots(1, figsize=(10, 10)) g1 = sns.distplot(c_components_len, hist=True, kde=False, ax=axes, norm_hist=False) g1 = sns.distplot([np.max(c_components_len)], hist=False, kde=False, rug=True, color='r', ax=axes, norm_hist=False) axes.annotate('LCC: %d' % np.max(c_components_len), xy=(np.max(c_components_len), 0), xytext=(np.max(c_components_len) - 10, axes.dataLim.y1 / 4), arrowprops=dict(arrowstyle="->")) sns.despine(ax=axes, top=True, bottom=False, right=True, left=True) g1.set_ylabel("Number of CC") g1.set_xlabel("Size of CC") if output_file.endswith('.pdf'): plt.savefig(output_file, format="pdf") elif output_file.endswith('.png'): plt.savefig(output_file, format="png") else: logging.warning('The null distribution figure can only be saved in pdf or png, forced to png') fig.savefig(output_file + '.png', format="png")
[docs]def plot_diffusion_matrix(nodes: list, matrix: np.ndarray, filename: str, show_labels: bool = False) -> None: """ Diagnosis tool for a diffusion matrix. It shows the weighted adjacency matrix that is the output of a build process :param nodes: the network nodes :param matrix: the diffusion matrix :param filename: the path to save the file :param show_labels: if labels should be plotted Example _______ >>> nodes = ["A", "B", "C"] >>> matrix = np.random.rand(3,2) >>> plot_diffusion_matrix(nodes, matrix, "diff_matrix.pdf") """ logging.info("Plotting figure as " + str(filename)) fig, axes = plt.subplots(1) axes.imshow(matrix, cmap="PuBu") if show_labels: pass plt.show() fig.savefig(filename + ".pdf", format="pdf")
[docs]def plot_null_distribution(null_distribution: list, observed: list, output_file: str, setname: str, alternative: str = "greater") -> None: """ Saves the density plot of the null distribution and pinpoints the observed value. Please refer to the CLI for the usage of this function :param null_distribution: the list with the values from the null distribution :param observed: list of the observed genes :param output_file: the path to save the file :param setname: the name of the gene set :param alternative: use "greater" if you want to take the genes with greater than the observed value Example _______ >>> from pygna import diagnostic >>> import pygna.statistical_test as st >>> import pygna.reading_class as rc >>> network = rc.ReadTsv("network_file.tsv").get_network() >>> st_test = st.StatisticalTest(st.geneset_total_degree_statistic, network) >>> geneset = rc.ReadGmt("geneset_file.gmt").get_geneset("brca") >>> for setname, item in geneset.items(): ... observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(item, max_iter=500, alternative="greater", cores=10) ... diagnostic.plot_null_distribution(null_d, observed, "./results/" + setname +'_total_degree_null_distribution.pdf', setname=setname) """ fig, axes = plt.subplots(1, figsize=(8, 6)) g1 = sns.distplot(null_distribution, hist=True, kde=True, rug=False, ax=axes) if alternative == "greater": if len(null_distribution[null_distribution > observed]): g3 = sns.distplot(null_distribution[null_distribution > observed], hist=False, kde=False, rug=True, rug_kws={'height': 1 / 50}, color="r", ax=axes) else: if len(null_distribution[null_distribution < observed]): g3 = sns.distplot(null_distribution[null_distribution < observed], hist=False, kde=False, rug=True, rug_kws={'height': 1 / 50}, color="r", ax=axes) ymax = axes.dataLim.y1 xmax = axes.dataLim.x1 print('xmax %f' % xmax) g4 = axes.stem([observed], [ymax / 2], "r", "r--") sns.despine(ax=axes, top=True, bottom=False, right=True, left=True) anchored_text = AnchoredText("Observed:%1.1E" % observed, loc=1, prop={'fontsize': 18, 'color': 'r'}, **{'frameon': False}) axes.add_artist(anchored_text) axes.set_xlabel('Statistics', fontsize=18) axes.set_ylabel('Density', fontsize=18) logging.info("Output for diagnostic null distribution: " + output_file) if output_file.endswith('.pdf'): fig.savefig(output_file, format="pdf") elif output_file.endswith('.png'): fig.savefig(output_file, format="png") else: logging.warning('The null distribution figure can only be saved in pdf or png, forced to png') fig.savefig(output_file + '.png', format="png")