Source code for pygna.reading_class

import abc

import pandas as pd
import tables
import networkx as nx
import numpy as np
import logging
import sys


class ReadingData:
    """
    Common base for the file readers defined in this module.

    Every concrete reader is expected to supply its own file-parsing routine
    and a ``get_data`` method returning what was parsed.

    Note that the class does not use an ABC metaclass, so the
    ``abstractmethod`` markers below are informative only: the base class can
    still be instantiated, and calling either method on it raises
    ``NotImplementedError``.
    """

    def __init__(self):
        super().__init__()

    @abc.abstractmethod
    def __readfile(self):
        """
        Placeholder for the file-parsing routine of a reader.

        The double-underscore name is subject to Python name mangling, so each
        subclass defines its own private ``__readfile`` rather than overriding
        this one.

        :raises NotImplementedError: always, when called on the base class
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_data(self):
        """
        Return the data held by the reader. Subclasses must override this method.

        :raises NotImplementedError: always, when called on the base class
        """
        raise NotImplementedError
class ReadTsv(ReadingData):
    """
    This class is used to read and parse a network file in a tab-separated format (tsv).

    Unless ``pd_table`` is set, the file is parsed when the object is created:
    the first two columns of every record are stored as an interaction and an
    undirected graph without self-loops is built from those interactions.
    """

    def __init__(self, filename: str, pd_table: bool = False, int_type: str = None):
        """
        :param filename: represents the path to the network file
        :param pd_table: if True the file is not parsed here and ``get_data``
            returns a pandas dataframe read from ``filename`` instead
        :param int_type: when given, keep only the records whose fourth column
            (a ``;``-separated list) contains this value
        """
        super().__init__()
        self.filename = filename
        self.int_type = int_type
        self.pd_table = pd_table
        self.interactions = []
        # The graph is only built in the non-dataframe mode.
        if not self.pd_table:
            self.__readfile()
            self.graph = self._convert_to_graph()

    def __readfile(self) -> None:
        """
        Read the file and store one ``(source, target)`` tuple per record in
        ``self.interactions``.

        Lines starting with ``#`` and blank lines are ignored.

        :raises IndexError: if a record has fewer columns than required
        """
        with open(self.filename, "r") as f:
            for record in f:
                if record.startswith("#"):
                    continue
                line = record.strip()
                if not line:
                    # A blank (e.g. trailing) line carries no interaction.
                    continue
                fields = line.split("\t")
                # When filtering, the fourth column lists the interaction types.
                if self.int_type and self.int_type not in fields[3].split(";"):
                    continue
                self.interactions.append((fields[0], fields[1]))

    def _convert_to_graph(self) -> nx.Graph:
        """
        Converts the interactions into a graph object

        :return: graph from the interactions, with self-loops removed
        """
        graph = nx.Graph()
        graph.add_edges_from(self.interactions)
        # ``Graph.selfloop_edges()`` is gone from recent networkx releases; the
        # module-level function is the supported spelling. The edges are
        # materialised first so the graph is not modified while iterating.
        graph.remove_edges_from(list(nx.selfloop_edges(graph)))
        return graph

    def get_data(self) -> pd.DataFrame or list:
        """
        Returns the data of the tsv file

        :return: a dataframe read from the file when ``pd_table`` is set,
            otherwise the list of interactions parsed at construction time

        Example
        _______
        >>> tsvdata = ReadTsv("mydata.tsv").get_data()
        """
        if self.pd_table:
            return pd.read_table(self.filename)
        return self.interactions

    def get_network(self) -> nx.Graph:
        """
        Returns the nx.graph object of the network

        The graph is only available when the object was created with
        ``pd_table=False``.

        :return: graph containing the network information

        Example
        _______
        >>> tsvdata = ReadTsv("mydata.tsv").get_network()
        """
        return self.graph
class ReadGmt(ReadingData):
    """
    This class is used to read a gmt file: one gene set per line, with
    tab-separated fields holding the set name, a descriptor and then the genes.
    """

    def __init__(self, filename: str, read_descriptor: bool = False):
        """
        :param filename: represents the path to the geneset file
        :param read_descriptor: if True, keep the descriptor (second field)
            next to the genes of each set. Default = False
        """
        super().__init__()
        self.filename = filename
        self.read_descriptor = read_descriptor
        self.gmt_data = self.__readfile()

    def __readfile(self) -> dict:
        """
        This method reads the geneset file into a dictionary keyed by set name.

        With ``read_descriptor`` each value is a dict with the keys ``"genes"``
        and ``"descriptor"``; otherwise each value is the list of genes.
        Blank lines are ignored.

        :return: gene_lists representing the genes of every set
        """
        gene_lists = dict()
        with open(self.filename, "r") as f:
            for record in f:
                line = record.strip()
                if not line:
                    # A blank line would otherwise create an empty-named set.
                    continue
                fields = line.split("\t")
                if self.read_descriptor:
                    gene_lists[fields[0]] = {
                        "genes": fields[2:],
                        "descriptor": fields[1],
                    }
                else:
                    gene_lists[fields[0]] = fields[2:]
        return gene_lists

    def get_data(self) -> dict:
        """
        Returns the data of the gmt file

        :return: dict representing the genes list

        Example
        _______
        >>> gmtdata = ReadGmt("mydata.gmt").get_data()
        """
        return self.gmt_data

    def get_geneset(self, setname: str = None) -> dict:
        """
        Returns the geneset from the gmt file

        The stored data is left untouched, so several sets can be requested
        one after the other from the same reader.

        :param setname: the setname to extract; when None all sets are returned
        :return: the geneset data, as a dict holding only ``setname`` when one is given

        If ``setname`` is not found an error is logged and the interpreter
        exits with status -1.

        Example
        _______
        >>> gmtdata = ReadGmt("mydata.gmt").get_geneset("brca")
        """
        if setname is None:
            return self.gmt_data
        if setname not in self.gmt_data:
            logging.error("Cannot find geneset: %s", setname)
            sys.exit(-1)
        # Build a new one-entry dict instead of clearing self.gmt_data.
        return {setname: self.gmt_data[setname]}
class ReadCsv(ReadingData):
    """
    This class is used to read a csv file.
    """

    def __init__(self, filename: str, sep: str = ",", use_cols: list = None, column_to_fill: str = None):
        """
        :param filename: represents the path to the data file
        :param sep: the separator to be used
        :param use_cols: columns used to be read and grouped
        :param column_to_fill: column whose NA values are replaced with 0 and
            which is then cast to int; nothing is filled when None
        """
        super().__init__()
        self.filename = filename
        self.sep = sep
        self.use_cols = use_cols
        self.name_column = column_to_fill
        self.data = self.__readfile()
        if self.name_column is not None:
            self._fill_na_column()

    def __readfile(self) -> pd.DataFrame:
        """
        This method reads the file with pandas, using the configured separator
        and column selection.

        :return: dataframe representing the data read inside the .csv
        """
        with open(self.filename, "r") as f:
            table = pd.read_csv(f, sep=self.sep, usecols=self.use_cols)
        return table

    def get_data(self) -> pd.DataFrame:
        """
        Returns the data of the csv file

        :return: dataframe representing the data read inside the .csv

        Example
        _______
        >>> csvdata = ReadCsv("mydata.csv").get_data()
        """
        return self.data

    def _fill_na_column(self) -> None:
        """
        Replace the N/A values of the configured column with 0 and cast the
        column to int.
        """
        # Assign the result back: an in-place fillna on a column selection is
        # chained assignment and is not guaranteed to update the dataframe.
        filled = self.data[self.name_column].fillna(0)
        self.data[self.name_column] = filled.astype(int)
class ReadTxt(ReadingData):
    """
    This class reads a txt file containing a single gene per line
    """

    def __init__(self, filename: str):
        """
        :param filename: represents the path to the txt file
        """
        super().__init__()
        self.filename = filename
        self.data = []
        self.__readfile()

    def __readfile(self) -> None:
        """
        Read the file line by line and store one entry per non-blank line in
        ``self.data``, without the line terminator or surrounding whitespace.
        """
        with open(self.filename, "r") as f:
            for gene_line in f:
                gene = gene_line.strip()
                # Keeping the raw line would leave a newline glued to every
                # gene name; blank lines carry no gene at all.
                if gene:
                    self.data.append(gene)

    def get_data(self) -> pd.DataFrame:
        """
        Get the dataframe from the class

        :return: single-column dataframe with one row per gene read from the file

        Example
        _______
        >>> txtdata = ReadTxt("mydata.txt").get_data()
        """
        return pd.DataFrame(self.data)
class ReadDistanceMatrix(ReadingData):
    """
    This class reads a distance matrix in the HDF5 format.

    The file is expected to expose a ``nodes`` and a ``matrix`` node under its root.
    """

    def __init__(self, filename: str, in_memory: bool = False):
        """
        :param filename: the path of the file to be read
        :param in_memory: if True the file is opened with the ``H5FD_CORE`` driver
        """
        super().__init__()
        self.filename = filename
        self.nodes = None
        self.data = None
        self.memory = in_memory
        self.__readfile()
        # isinstance also covers bytes subclasses such as numpy.bytes_; the
        # emptiness check avoids an IndexError on a file without nodes.
        if self.nodes and isinstance(self.nodes[0], bytes):
            self._decode()

    def __readfile(self) -> None:
        """
        Open the HDF5 file, read the nodes and the matrix into attributes and
        close the file again.
        """
        open_kwargs = {"mode": "r"}
        if self.memory:
            open_kwargs["driver"] = "H5FD_CORE"
        # Both datasets are fully read by the [:] slices, so the handle can be
        # released as soon as the block is left.
        with tables.open_file(self.filename, **open_kwargs) as hdf5_file:
            self.nodes = list(hdf5_file.root.nodes[:])
            self.data = hdf5_file.root.matrix[:]

    def _decode(self) -> None:
        """
        Decode the node names from bytes to str
        """
        self.nodes = [i.decode() for i in self.nodes]

    def get_data(self) -> tuple:
        """
        Return the data of the HDF5 Matrix

        :return: a ``(nodes, data)`` tuple: the nodes of the HDF5 Matrix and the matrix itself

        Example
        _______
        >>> nodes, data = ReadDistanceMatrix("mydata.hdf5").get_data()
        """
        return self.nodes, self.data