import abc
import pandas as pd
import tables
import networkx as nx
import numpy as np
import logging
import sys
class ReadingData:
    """
    Common base class for the file readers defined in this module.

    Concrete readers are expected to supply their own file-reading routine and
    a ``get_data`` method.

    Note: the class does not use ``abc.ABCMeta``, so the ``abstractmethod``
    markers below are informational only and instantiation is not blocked.
    Because ``__readfile`` has two leading underscores, its name is mangled per
    class; a ``__readfile`` defined in a subclass is a separate private method
    rather than an override of this one.
    """

    def __init__(self):
        super().__init__()

    @abc.abstractmethod
    def __readfile(self):
        """
        Placeholder for the routine that reads the custom file format.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    @abc.abstractmethod
    def get_data(self):
        """
        Return the data held by the reader. Subclasses must override this method.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError
class ReadTsv(ReadingData):
    """
    Read and parse a network file in tab-separated (tsv) format.

    Every non-comment, non-blank record contributes one interaction made of
    its first two tab-separated columns. The interactions are also exposed as
    an undirected ``nx.Graph`` with self loops removed.
    """

    def __init__(self, filename: str, pd_table: bool = False, int_type: int = None):
        """
        :param filename: path to the network file
        :param pd_table: when True the file is not parsed on construction and
            ``get_data`` returns a ``pd.DataFrame`` read from ``filename``
        :param int_type: when truthy, keep only the records whose fourth column
            (split on ``";"``) contains this value. NOTE(review): the value is
            compared against string tokens, so a ``str`` is presumably what
            callers pass despite the ``int`` annotation -- confirm.
        """
        super().__init__()
        self.filename = filename
        self.int_type = int_type
        self.pd_table = pd_table
        self.interactions = []
        # Always define the attribute so get_network() cannot fail with an
        # AttributeError when the file was not parsed (pd_table=True).
        self.graph = nx.Graph()
        if not self.pd_table:
            self.__readfile()
            self.graph = self._convert_to_graph()

    def __readfile(self) -> None:
        """
        Read the file and store the parsed interactions in ``self.interactions``.

        Lines starting with ``#`` and blank lines are skipped.
        """
        with open(self.filename, "r") as f:
            for record in f:
                if record.startswith("#"):
                    continue
                stripped = record.strip()
                if not stripped:
                    # A blank line has no columns; indexing it would raise IndexError.
                    continue
                fields = stripped.split("\t")
                if self.int_type:
                    # The fourth column holds ";"-separated interaction types.
                    if self.int_type not in fields[3].split(";"):
                        continue
                self.interactions.append((fields[0], fields[1]))

    def _convert_to_graph(self) -> nx.Graph:
        """
        Convert the interactions into a graph object.

        :return: undirected graph built from the interactions, without self loops
        """
        graph = nx.Graph()
        graph.add_edges_from(self.interactions)
        # Graph.selfloop_edges() no longer exists in recent networkx releases;
        # the module-level function is the supported spelling. The list() copy
        # avoids mutating the graph while its edges are being iterated.
        graph.remove_edges_from(list(nx.selfloop_edges(graph)))
        return graph

    def get_data(self) -> pd.DataFrame or list:
        """
        Return the data of the tsv file.

        :return: a ``pd.DataFrame`` read with ``pd.read_table`` when
            ``pd_table`` is True, otherwise the list of interaction tuples

        Example
        _______
        >>> tsvdata = ReadTsv("mydata.tsv").get_data()
        """
        if self.pd_table:
            return pd.read_table(self.filename)
        return self.interactions

    def get_network(self) -> nx.Graph:
        """
        Return the nx.Graph object of the network.

        :return: graph containing the network information; it is empty when
            the reader was created with ``pd_table=True``

        Example
        _______
        >>> tsvdata = ReadTsv("mydata.tsv").get_network()
        """
        return self.graph
class ReadGmt(ReadingData):
    """
    Read a gmt file: one gene set per line, as tab-separated fields holding the
    set name, a descriptor and then the genes of the set.
    """

    def __init__(self, filename: str, read_descriptor: bool = False):
        """
        :param filename: path to the geneset file
        :param read_descriptor: when True, store the descriptor (second field)
            alongside the genes. Default = False
        """
        super().__init__()
        self.filename = filename
        self.read_descriptor = read_descriptor
        self.gmt_data = self.__readfile()

    def __readfile(self) -> dict:
        """
        Read the geneset file into a dictionary keyed by set name.

        :return: ``{setname: genes}``, or
            ``{setname: {"genes": genes, "descriptor": descriptor}}`` when
            ``read_descriptor`` is True
        """
        gene_lists = dict()
        with open(self.filename, "r") as f:
            for record in f:
                stripped = record.strip()
                if not stripped:
                    # Blank lines would otherwise create an empty-named set
                    # (or raise IndexError when descriptors are requested).
                    continue
                fields = stripped.split("\t")
                if self.read_descriptor:
                    gene_lists[fields[0]] = {
                        "genes": fields[2:],
                        "descriptor": fields[1],
                    }
                else:
                    gene_lists[fields[0]] = fields[2:]
        return gene_lists

    def get_data(self) -> dict:
        """
        Return the data of the gmt file.

        :return: dict representing the genes list

        Example
        _______
        >>> gmtdata = ReadGmt("mydata.gmt").get_data()
        """
        return self.gmt_data

    def get_geneset(self, setname: str = None) -> dict:
        """
        Return the geneset from the gmt file.

        The stored data is left untouched, so the reader can be queried again
        for a different set name.

        :param setname: the setname to extract; None returns every set
        :return: a dict holding only ``setname``, or all the data when
            ``setname`` is None

        If ``setname`` is not found, an error is logged and the process exits
        through ``sys.exit(-1)``.

        Example
        _______
        >>> gmtdata = ReadGmt("mydata.gmt").get_geneset("brca")
        """
        if setname is None:
            return self.gmt_data
        if setname not in self.gmt_data:
            logging.error("Cannot find geneset: %s", setname)
            sys.exit(-1)
        # Build a new one-entry dict instead of clearing the cached data.
        return {setname: self.gmt_data[setname]}
class ReadCsv(ReadingData):
    """
    Read a delimiter-separated (csv) file into a ``pd.DataFrame``.
    """

    def __init__(self, filename: str, sep: str = ",", use_cols: list = None, column_to_fill: str = None):
        """
        :param filename: path to the data file
        :param sep: the separator to be used
        :param use_cols: columns to read, forwarded to ``pd.read_csv`` as ``usecols``
        :param column_to_fill: column whose N/A values are replaced by 0 and
            which is then cast to ``int``; None leaves the data as read
        """
        super().__init__()
        self.filename = filename
        self.sep = sep
        self.use_cols = use_cols
        self.name_column = column_to_fill
        self.data = self.__readfile()
        if self.name_column is not None:
            self._fill_na_column()

    def __readfile(self) -> pd.DataFrame:
        """
        Read the file with ``pd.read_csv``.

        :return: dataframe representing the data read inside the .csv
        """
        with open(self.filename, "r") as f:
            table = pd.read_csv(f, sep=self.sep, usecols=self.use_cols)
        return table

    def get_data(self) -> pd.DataFrame:
        """
        Return the data of the csv file.

        :return: dataframe representing the data read inside the .csv

        Example
        _______
        >>> csvdata = ReadCsv("mydata.csv").get_data()
        """
        return self.data

    def _fill_na_column(self) -> None:
        """
        Replace the N/A values of ``self.name_column`` with 0 and cast the
        column to ``int``.
        """
        # Assign the result back instead of calling fillna(inplace=True) on the
        # selected column: the chained in-place form acts on an intermediate
        # object and does not update the frame under pandas Copy-on-Write.
        filled = self.data[self.name_column].fillna(0)
        self.data[self.name_column] = filled.astype(int)
class ReadTxt(ReadingData):
    """
    Reader for a plain text file that lists one gene per line.
    """

    def __init__(self, filename: str):
        super().__init__()
        self.filename = filename
        self.data = []
        self.__readfile()

    def __readfile(self) -> None:
        """
        Load every line of the file, in order, into ``self.data``.

        Lines are stored exactly as read, i.e. including their line terminator.
        """
        with open(self.filename, "r") as handle:
            # Iterating the handle yields the same lines as repeated readline().
            self.data.extend(handle)

    def get_data(self) -> pd.DataFrame:
        """
        Build a dataframe from the lines that were read.

        :return: single-column dataframe with one row per line of the file

        Example
        _______
        >>> txtdata = ReadTxt("mydata.txt").get_data()
        """
        lines = self.data
        return pd.DataFrame(lines)
class ReadDistanceMatrix(ReadingData):
    """
    Read a distance matrix stored in the HDF5 format.

    The file is expected to expose a ``nodes`` array and a ``matrix`` array
    under its root group.
    """

    def __init__(self, filename: str, in_memory: bool = False):
        """
        :param filename: the path of the file to be read
        :param in_memory: open the file with the ``H5FD_CORE`` driver instead of
            the default one
        """
        super().__init__()
        self.filename = filename
        self.nodes = None
        self.data = None
        self.memory = in_memory
        self.__readfile()
        # Guard against an empty node array before looking at its first item.
        # NOTE(review): the exact-type comparison is kept from the original; an
        # isinstance() check would also decode bytes subclasses (for example
        # numpy.bytes_) -- confirm which behaviour callers rely on before
        # loosening it.
        if self.nodes and type(self.nodes[0]) is bytes:
            self._decode()

    def __readfile(self) -> None:
        """
        Read the ``nodes`` and ``matrix`` arrays from the HDF5 file into
        ``self.nodes`` and ``self.data``, then close the file.
        """
        open_kwargs = {"driver": "H5FD_CORE"} if self.memory else {}
        # Both arrays are sliced while the file is open; the handle is released
        # when the block exits, even if reading fails.
        with tables.open_file(self.filename, mode="r", **open_kwargs) as hdf5_file:
            self.nodes = list(hdf5_file.root.nodes[:])
            self.data = hdf5_file.root.matrix[:]

    def _decode(self) -> None:
        """
        Decode the node names from bytes to str.
        """
        self.nodes = [i.decode() for i in self.nodes]

    def get_data(self) -> [list, np.matrix]:
        """
        Return the content of the HDF5 matrix file.

        :return: the nodes of the HDF5 matrix followed by the matrix data

        Example
        _______
        >>> nodes, data = ReadDistanceMatrix("mydata.hdf5").get_data()
        """
        return self.nodes, self.data