Source code for src.matrix

"""
.. module:: matrix
   :synopsis: This module implements the Matrix, HistoneMark and Hic classes.
"""

# Standard-library and third-party modules
import csv
import math as m
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
import pandas as pd
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats


class Matrix:
    """
    .. class:: Matrix
        This class stores a matrix and several related numpy arrays, and plots and writes
        this matrix.

    Attributes:
        resolution (int): Resolution (or bin size) of the matrix
        chrom_num (int): Chromosome chosen for processing
        side (int): Square side of a numpy array sub-matrix
        matrix (numpy array): Matrix stored in a numpy array
        sub_matrices (numpy array): The S non-empty sub-matrices of size side*side cut out of
                                    the matrix, stored in a numpy array of shape
                                    (S, side, side, 1)
        white_sub_matrices_ind (list): Position of the blank sub-matrices
        total_sub_matrices (int): Total number of sub-matrices (blank ones included)
        latent_spaces (numpy array): Latent spaces (encoded sub-matrices) stored in a numpy array
        predicted_sub_matrices (numpy array): Predicted sub_matrices (decoded latent spaces) stored
                                              in a numpy array
    """

    def __init__(self, resolution, chrom_num, side):
        self.resolution = resolution
        self.chrom_num = chrom_num
        self.side = side
        # Everything below is filled in later by the set_* methods.
        self.matrix = None
        self.sub_matrices = None
        self.white_sub_matrices_ind = None
        self.total_sub_matrices = None
        self.latent_spaces = None
        self.predicted_sub_matrices = None

    def set_sub_matrices(self):
        """
            Divide the matrix into sub-matrices of size side*side.

            The matrix is scanned row-major. Incomplete tiles on the right and bottom borders
            are ignored. The empty sub-matrices (sum(values)==0) are removed from the data set
            and their position is recorded in white_sub_matrices_ind.
            The S remaining sub-matrices are stored in a numpy array of shape
            (S, side, side, 1); total_sub_matrices counts the empty ones too.
        """
        side = self.side
        n_rows, n_cols = self.matrix.shape
        blank_positions = []  # Row-major position of every empty tile
        kept_tiles = []
        position = 0
        # The range bounds only produce complete tiles, so no shape check is needed.
        # Rows are bounded by the row count (not the column count) so that a
        # non-square matrix is tiled completely.
        for top in range(0, n_rows - side + 1, side):
            for left in range(0, n_cols - side + 1, side):
                tile = self.matrix[top:top + side, left:left + side]
                if tile.sum() != 0:
                    kept_tiles.append(tile)
                else:
                    blank_positions.append(position)
                position += 1
        # The explicit dtype keeps the matrix dtype even when no tile was kept.
        tiles = np.array(kept_tiles, dtype=self.matrix.dtype)
        # The number of sub-matrices is calculated automatically by using -1 in the first field
        self.sub_matrices = tiles.reshape(-1, side, side, 1)
        self.white_sub_matrices_ind = blank_positions
        self.total_sub_matrices = position

    def _restore_blank_slots(self, compact):
        """
            Re-insert zero-filled entries at the positions of the blank sub-matrices.

            Args:
                compact(numpy array): One entry per non-blank sub-matrix along axis 0

            Returns:
                numpy array: Array with total_sub_matrices entries along axis 0, or the input
                unchanged when there is no blank sub-matrix

            Raises:
                TypeError: If set_sub_matrices() has not been called yet
                ValueError: If the number of entries does not match the non-blank sub-matrices
        """
        if self.total_sub_matrices is None or self.white_sub_matrices_ind is None:
            raise TypeError('set_sub_matrices() must be called before storing predictions')
        blank_positions = self.white_sub_matrices_ind
        if not blank_positions:
            return compact
        compact = np.asarray(compact)
        expected = self.total_sub_matrices - len(blank_positions)
        if compact.shape[0] != expected:
            raise ValueError('expected {} entries along axis 0, got {}'
                             .format(expected, compact.shape[0]))
        # One allocation and one masked assignment instead of one array copy per blank.
        keep = np.ones(self.total_sub_matrices, dtype=bool)
        keep[blank_positions] = False
        restored = np.zeros((self.total_sub_matrices,) + compact.shape[1:],
                            dtype=compact.dtype)
        restored[keep] = compact
        return restored

    def set_predicted_latent_spaces(self, latent_spaces):
        """
            Set the latent spaces predicted by the encoder.
            A zero-filled latent space is inserted at the position of every blank sub-matrix.

            Args:
                latent_spaces(numpy array): The predicted latent_spaces
        """
        self.latent_spaces = self._restore_blank_slots(latent_spaces)

    def set_predicted_sub_matrices(self, predicted_sub_matrices):
        """
            Set the sub-matrices predicted by the whole autoencoder.
            A zero-filled sub-matrix is inserted at the position of every blank sub-matrix.

            Args:
                predicted_sub_matrices(numpy array): The predicted sub-matrices
        """
        self.predicted_sub_matrices = self._restore_blank_slots(predicted_sub_matrices)

    def write_sparse_matrix(self, matrix_type, path):
        """
            The matrix is saved in a tab-separated sparse matrix file named
            <path>/<matrix_type>_true.bed with the columns chromosome, row position,
            column position and value (non-zero values only).

            Args:
                matrix_type(str): Matrix's name
                path(str): Path of the output
        """
        # Creation of the sparse matrix
        sparse = coo_matrix(self.matrix)
        chrom_label = 'chr' + str(self.chrom_num)
        # Bin indexes are widened to int64 before scaling so that large genomic
        # positions cannot overflow the 32-bit index type used by scipy.
        row_positions = sparse.row.astype(np.int64) * self.resolution
        col_positions = sparse.col.astype(np.int64) * self.resolution
        # newline='' is what the csv module requires; without it every line ends
        # with an extra carriage return on Windows.
        with open('{}/{}_true.bed'.format(path, matrix_type), 'w', newline='') as file:
            writer = csv.writer(file, delimiter='\t')
            writer.writerows(zip([chrom_label] * len(sparse.row),
                                 row_positions, col_positions, sparse.data))

    def plot_matrix(self, matrix_type, color_map, path):
        """
            The matrix is plotted in the file <path>/<matrix_type>_true.pdf.

            Args:
                matrix_type(str): Matrix's name
                color_map(matplotlib.colors.ListedColormap): Color map
                path(str): Path of the output plot
        """
        fig = plt.figure(figsize=(12, 12))
        axes = plt.subplot(111, aspect='equal')
        img = axes.matshow(self.matrix, cmap=color_map)
        # The color bar gets its own narrow axes on the right of the matrix
        divider = make_axes_locatable(axes)
        cax = divider.append_axes("right", size="2%", pad=0.15)
        plt.colorbar(img, cax=cax)
        plt.subplots_adjust(left=0.07, bottom=0, right=0.95, top=0.91, wspace=0, hspace=0)
        axes.set_title('True chr{} {} matrix'.format(self.chrom_num, matrix_type), fontsize=25)
        axes.axis('off')
        fig.savefig('{}/{}_true.pdf'.format(path, matrix_type))
        plt.close(fig)

    def plot_distribution_matrix(self, matrix_type, path):
        """
            Plot the distribution of the matrix values (histogram with 1000 bins) in the file
            <path>/<matrix_type>_true_distrib.pdf.

            Args:
                matrix_type(str): Matrix's name
                path(str): Path of the output plot
        """
        plt.hist(self.matrix.reshape(-1), 1000)
        plt.suptitle("{} distribution".format(matrix_type))
        plt.savefig('{}/{}_true_distrib.pdf'.format(path, matrix_type))
        plt.close()

    def plot_sub_matrices(self, matrix_type, index_list, color_map, path):
        """
            The selected sub-matrices (at most 40, on a 4x10 grid) are plotted in the file
            <path>/submatrices_<matrix_type>_true.pdf.

            Args:
                matrix_type(str): Matrix's name
                index_list(list): List of the 40 sub-matrix indexes to plot
                color_map(matplotlib.colors.ListedColormap): Color map
                path(str): Path of the output plot
        """
        fig, axes = plt.subplots(4, 10, figsize=(24, 11))
        fig.suptitle('True chr{} {} sub-matrices'.format(self.chrom_num, matrix_type), fontsize=20)
        fig.subplots_adjust(left=0.03, right=0.98, wspace=0.3, hspace=0.4)
        # The trailing channel axis is dropped so that each tile is a 2D image
        tiles = self.sub_matrices[index_list, ..., 0]
        for axe, index, tile in zip(axes.flat, index_list, tiles):
            axe.imshow(tile, cmap=color_map)
            axe.set_title("submatrix n°{}".format(index))
            axe.axis('off')
        fig.savefig('{}/submatrices_{}_true.pdf'.format(path, matrix_type))
        plt.close(fig)


class HistoneMark(Matrix):
    """
    .. class:: HistoneMark
        This class inherits the Matrix class and sets the matrix numpy array for a histone mark.

    Attributes:
        mark_df (Pandas Dataframe): Histone mark sparse matrix (columns chr, base_1, base_2
                                    and value) restricted to the chromosome chrom_num
    """

    def __init__(self, bed_file, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The file is tab-separated, has no header line and holds exactly four columns
        mark_df = pd.read_csv(bed_file, sep='\t', header=None)
        mark_df.columns = ['chr', 'base_1', 'base_2', 'value']
        self.mark_df = mark_df[mark_df['chr'] == 'chr' + str(self.chrom_num)]

    def set_matrix(self):
        """
            Set the histone mark numpy array of the chromosome chrom_num.
            The values are multiplied by 100, converted in float32 and rescaled in range 0-1
            (min-max scaling method). A matrix whose values are all equal is set to zeros
            instead of being divided by zero.

            Raises:
                ValueError: If the file holds no entry for the chromosome chrom_num
        """
        if self.mark_df.empty:
            raise ValueError('no entry found for chr{}'.format(self.chrom_num))
        values = np.asarray(self.mark_df['value']) * 100
        # base_1 and base_2 columns must be converted to index by dividing by the resolution
        # This step is necessary for the creation of the sparse matrix with scipy.sparse
        rows = np.asarray(self.mark_df['base_1'] // self.resolution).astype(int)
        cols = np.asarray(self.mark_df['base_2'] // self.resolution).astype(int)
        # The matrix is square and large enough to hold the highest bin index
        size = int(max(rows.max(), cols.max())) + 1
        # Creation of the sparse matrix and conversion into a numpy array
        matrix = coo_matrix((values, (rows, cols)), shape=(size, size)).toarray()
        # Conversion into float32
        matrix = matrix.astype(np.float32)
        # Rescaling of the values in range 0-1 (min-max scaling method)
        lowest = matrix.min()
        value_range = matrix.max() - lowest
        if value_range > 0:
            matrix = (matrix - lowest) / value_range
        else:
            # A constant matrix carries no signal; avoid the 0/0 that would yield NaN
            matrix = np.zeros_like(matrix)
        self.matrix = matrix


class Hic(Matrix):
    """
    .. class:: Hic
        This class inherits the Matrix class and sets the matrix numpy array for Hi-C data.

    Attributes:
        cooler (cooler): Storage of the Hi-C matrix
    """

    def __init__(self, cooler, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cooler = cooler

    def calculate_cum_length(self):
        """
            Calculates and returns the cumulated length from chromosome 1 to N.

            Returns:
                Pandas DataFrame: Information on chromosomes, their length and cumulated length
                (column cum_length)
        """
        chroms = self.cooler.chroms()[:]
        # The sum is accumulated in 64 bits: a whole genome does not fit in 32 bits.
        chroms['cum_length'] = np.cumsum(np.asarray(chroms['length']), dtype=np.int64)
        return chroms

    def set_matrix(self):
        """
            Set the Hi-C numpy array of the chromosome chrom_num.
            The matrix is transformed into an upper triangular matrix and the values are
            converted in float32, rescaled by log10(x + 1) and normalized in range 0-1
            (min-max scaling method). A matrix whose values are all equal is set to zeros
            instead of being divided by zero.

            The chromosome is selected by position: chrom_num == 1 is the first chromosome
            stored in the cooler.

            Raises:
                ValueError: If chrom_num is lower than 1
        """
        if self.chrom_num < 1:
            raise ValueError('chrom_num must be >= 1, got {}'.format(self.chrom_num))
        # Every chromosome starts on a fresh bin in a cooler, so the bin range cannot be
        # derived from cumulated base-pair lengths; the cooler is asked for it instead.
        # NOTE(review): the range now follows the bin size stored in the cooler,
        # presumably identical to self.resolution - confirm against the caller.
        chrom_name = self.cooler.chromnames[self.chrom_num - 1]
        bin_1, bin_2 = self.cooler.extent(chrom_name)
        # Creation of the sparse matrix and conversion into a numpy array
        selector = self.cooler.matrix(balance=False, sparse=True)
        matrix = selector[bin_1:bin_2, bin_1:bin_2].toarray()
        # The matrix is symmetric, so only the upper triangular matrix is kept
        matrix = np.triu(matrix)
        # Conversion into float32
        matrix = matrix.astype(np.float32)
        # Log scale to visualize better the matrix in plot
        matrix = np.log10(matrix + 1)
        # Rescaling of the values in range 0-1 (min-max scaling method)
        lowest = matrix.min()
        value_range = matrix.max() - lowest
        if value_range > 0:
            matrix = (matrix - lowest) / value_range
        else:
            # A constant matrix carries no signal; avoid the 0/0 that would yield NaN
            matrix = np.zeros_like(matrix)
        self.matrix = matrix