"""
.. module:: matrix
:synopsis: This module implements the Matrix, HistoneModification and Hic classes.
"""
# Third-party modules
import csv
import math as m
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
import pandas as pd
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import stats
[docs]class Matrix:
"""
.. class:: Matrix
This class stores a matrix and different related numpy array, plots and writes this matrix.
Attributes:
resolution (int): Resolution (or bin size) of the matrix
chrom_num (int): Chromosome chosen for processing
side (int): Square side of a numpy array sub-matrix
matrix (numpy array): Matrix stored in a numpy array
sub_matrices (numpy array): The matrix is divided into S sub-matrices of size side*side
and stored in a numpy array of shape (X, side, side, 1)
white_sub_matrices_ind (list): Position of the blank sub-matrices
total_sub_matrices (int): Total number of sub-matrices
latent_spaces (numpy array): Latent spaces (encoded sub-matrices) stored in a numpy array
predicted_sub_matrices (numpy array): Predicted sub_matrices (decoded latent spaces) stored
in a numpy array
"""
def __init__(self, resolution, chrom_num, side):
self.resolution = resolution
self.chrom_num = chrom_num
self.side = side
self.matrix = None
self.sub_matrices = None
self.white_sub_matrices_ind = None
self.total_sub_matrices = None
self.latent_spaces = None
self.predicted_sub_matrices = None
[docs] def set_sub_matrices(self):
"""
Divide the matrix into S sub-matrices of size side*side.
The empty sub-matrices (sum(values)==0) are removed from the data set.
The S resulted sub-matrices are stored in a numpy array of shape (X, side, side, 1).
"""
white_ind = [] # Index of the white sub-matrices are stored in a list
k = 0
sub_matrices_list = []
for i in range(0, self.matrix.shape[1], self.side):
for j in range(0, self.matrix.shape[1], self.side):
sub_matrix = self.matrix[i:i+self.side, j:j+self.side]
# We do not want sub-matrix with a size different than side*side
if sub_matrix.shape != (self.side, self.side):
break
# The empty sub-matrices are not taking into account
if sub_matrix.sum() != 0:
sub_matrices_list.append(sub_matrix)
else:
white_ind.append(k)
k += 1
sub_matrices = np.array(sub_matrices_list)
# The number of sub-matrices is calculated automatically by using -1 in the first field
sub_matrices = sub_matrices.reshape(-1, self.side, self.side, 1)
self.white_sub_matrices_ind = white_ind
self.total_sub_matrices = k
self.sub_matrices = sub_matrices
[docs] def set_predicted_latent_spaces(self, latent_spaces):
"""
Set the latent spaces predicted by the encoder.
Args:
latent_spaces(numpy array): The predicted latent_spaces
"""
for ind in range(self.total_sub_matrices):
if ind in self.white_sub_matrices_ind:
latent_spaces = np.insert(latent_spaces, ind, 0, axis=0)
self.latent_spaces = latent_spaces
[docs] def set_predicted_sub_matrices(self, predicted_sub_matrices):
"""
Set the sub-matrices predicted by the whole autoencoder.
Args:
predicted_sub_matrices(numpy array): The predicted sub-matrices
"""
for ind in range(self.total_sub_matrices):
if ind in self.white_sub_matrices_ind:
predicted_sub_matrices = np.insert(predicted_sub_matrices, ind, 0, axis=0)
self.predicted_sub_matrices = predicted_sub_matrices
[docs] def write_sparse_matrix(self, matrix_type, path):
"""
The reconstructed and predicted Hi-C matrix is saved in a sparse matrix file.
Args:
matrix_type(str): Matrix's name
path(str): Path of the output
"""
# Creation of the sparse matrix
sparse = coo_matrix(self.matrix)
with open('{}/{}_true.bed'.format(path, matrix_type), 'w') as file:
writer = csv.writer(file, delimiter='\t')
writer.writerows(zip(['chr'+str(self.chrom_num)]*len(sparse.row),
sparse.row*self.resolution,
sparse.col*self.resolution, sparse.data))
[docs] def plot_matrix(self, matrix_type, color_map, path):
"""
The matrix is plotted in a file.
Args:
matrix_type(str): Matrix's name
color_map(matplotlib.colors.ListedColormap): Color map
path(str): Path of the output plot
"""
fig = plt.figure(figsize=(12, 12))
axes = plt.subplot(111, aspect='equal')
img = axes.matshow(self.matrix, cmap=color_map)
divider = make_axes_locatable(axes)
cax = divider.append_axes("right", size="2%", pad=0.15)
plt.colorbar(img, cax=cax)
plt.subplots_adjust(left=0.07, bottom=0, right=0.95, top=0.91, wspace=0, hspace=0)
axes.set_title('True chr{} {} matrix'.format(self.chrom_num, matrix_type), fontsize=25)
axes.axis('off')
fig.savefig('{}/{}_true.pdf'.format(path, matrix_type))
plt.close()
[docs] def plot_distribution_matrix(self, matrix_type, path):
"""
Plot the distribution of the matrix.
Args:
matrix_type(str): Matrix's name
path(str): Path of the output plot
"""
plt.hist(self.matrix.reshape(-1), 1000)
plt.suptitle("{} distribution".format(matrix_type))
plt.savefig('{}/{}_true_distrib.pdf'.format(path, matrix_type))
plt.close()
[docs] def plot_sub_matrices(self, matrix_type, index_list, color_map, path):
"""
40 random sub-matrices are plotted in a file.
Args:
matrix_type(str): Matrix's name
index_list(list): List of the 40 sub-matrix indexes to plot
color_map(matplotlib.colors.ListedColormap): Color map
path(str): Path of the output plot
"""
fig, axes = plt.subplots(4, 10, figsize=(24, 11))
fig.suptitle('True chr{} {} sub-matrices'.format(self.chrom_num, matrix_type), fontsize=20)
fig.subplots_adjust(left=0.03, right=0.98, wspace=0.3, hspace=0.4)
i = 0
for axe, index in zip(axes.flat, self.sub_matrices[index_list, ..., 0]):
axe.imshow(index, cmap=color_map)
axe.set_title("submatrix n°{}".format(index_list[i]))
axe.axis('off')
i += 1
plt.savefig('{}/submatrices_{}_true.pdf'.format(path, matrix_type))
plt.close()
[docs]class HistoneMark(Matrix):
"""
.. class:: HistoneModification
This class inherits the Matrix class and set the matrix numpy array for a histone mark.
Attributes:
mark_df (Pandas Dataframe): Histone modification sparse matrix
"""
def __init__(self, bed_file, *args, **kwargs):
super().__init__(*args, **kwargs)
self.mark_df = pd.read_csv(bed_file, sep='\t', header=None)
self.mark_df.columns = ['chr', 'base_1', 'base_2', 'value']
self.mark_df = self.mark_df[self.mark_df['chr'] == 'chr'+str(self.chrom_num)]
[docs] def set_matrix(self):
"""
Set the histone modification numpy array of the chromosome chrom_num.
The values of the matrix are converted in float32 and rescaled by log10 and normalized.
"""
data = self.mark_df.value*100
# data = data - 1
# base_1 and base_2 columns must be converted to index by dividing by the resolution number
# This step is necesary for the creation of the sparse matrix with scipy.sparse
row = ((self.mark_df.base_1 / self.resolution)).astype(int)
col = ((self.mark_df.base_2 / self.resolution)).astype(int)
# Creation of the sparse matrix and conversion into a numpy array
size = int(max(max(self.mark_df['base_2']), max(self.mark_df['base_1'])) / self.resolution)
matrix = coo_matrix((data, (row, col)), shape=(size+1, size+1)).toarray()
# Conversion into float32
matrix = np.float32(matrix)
# Log scale to visualize better the matrix in plot
# matrix = np.log10(matrix+1)
# Rescaling of the values in range 0-1 (min-max scaling method)
matrix = (matrix - matrix.min()) / (matrix.max() - matrix.min())
self.matrix = matrix
[docs]class Hic(Matrix):
"""
.. class:: Hic
This class inherits the Matrix class and set the matrix numpy array for a Hi-C data.
Attributes:
cooler (cooler): Storage of the Hi-C matrix
"""
def __init__(self, cooler, *args, **kwargs):
super().__init__(*args, **kwargs)
self.cooler = cooler
[docs] def calculate_cum_length(self):
"""
Calculates and returns the cumulated length from chromosome 1 to N.
Returns:
Pandas DataFrame: Informations on chromosomes, their length and cumulated length
"""
chroms = self.cooler.chroms()[:]
cum_length = []
for i, size in enumerate(list(chroms.length)):
if i == 0:
cum_length.append(size)
else:
cum_length.append(size + cum_length[i-1])
chroms['cum_length'] = cum_length
return chroms
[docs] def set_matrix(self):
"""
Set the Hi-C numpy array of the chromosome chrom_num.
The matrix is transformed into an upper triangular matrix and the values are converted
in float32 and rescaled by log10 and normalized.
"""
chroms = self.calculate_cum_length()
if self.chrom_num == 1:
bin_1 = 0
else:
bin_1 = m.floor(chroms[self.chrom_num-2:self.chrom_num-1]['cum_length']/self.resolution)
bin_2 = m.ceil(chroms[self.chrom_num-1:self.chrom_num]['cum_length']/self.resolution)
# Creation of the sparse matrix and conversion into a numpy array
matrix = self.cooler.matrix(balance=False, sparse=True)[bin_1:bin_2, bin_1:bin_2].toarray()
# The matrix is symetric then we keep only the upper triangular matrix
matrix = np.triu(matrix)
# Conversion into float32
matrix = np.float32(matrix)
# Log scale to visualize better the matrix in plot
matrix = np.log10(matrix+1)
# Rescaling of the values in range 0-1
# (min-max scaling method)
# matrix = 2.0*np.sqrt(matrix + 3.0/8.0)
# stats.boxcox(matrix)
matrix = (matrix - matrix.min()) / (matrix.max() - matrix.min())
self.matrix = matrix