Source code for HiPart.visualizations

# Copyright (c) 2022 Panagiotis Anagnostou

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Implementation module for the static visualization of the algorithms implemented
in the HiPart package.

@author: Panagiotis Anagnostou
@author: Nicos Pavlidis
"""

import matplotlib
import HiPart.__utility_functions as util
import math
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np

from HiPart.clustering import BisectingKmeans
from HiPart.clustering import DePDDP
from HiPart.clustering import IPDDP
from HiPart.clustering import KMPDDP
from HiPart.clustering import PDDP
from HiPart.clustering import MDH
from scipy.cluster import hierarchy
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity


[docs] def split_visualization(hipart_object, color_map="viridis", mdh_split_plot=True): """ Create the visualization of each of the splits generated by one of the divisive hierarchical clustering algorithms, members of HiPart package. For each split, we visualize the data on the first two principal components, while the color of each sample is chosen depending on the cluster it belongs to. The colors throughout the separate split represent the same cluster each time. Depending on the input object, the visualization is enriched with additional information related to the algorithm that created it. For the: 1. DePDDP object, the visualization adds a marginal plot on the X-axis that represents the density of the data as extracted from the kernel density estimation for the first principal component. That is the information the dePDDP algorithm utilizes to split each cluster. 2. KMPDDP object, the visualization adds a marginal plot on the X-axis that represents the data as they are projected on the first principal component, with the addition of the centers the k-Means finds within its execution on each split. That is the information the kM_PDDP algorithm utilizes to split each cluster. 3. iPDDP along with PDDP objects, the visualization adds a marginal plot on the X-axis that represents the data as they are projected on the first principal component. That is the information the iPDDP algorithm utilizes to split each cluster. 4. BisectingKmeans object does not include additional information in the visualization. That is because of the nature of the algorithm it implements. 5. MDH object does not include additional information in the visualization. That is because of the nature of the algorithm it implements. Finally, for all the objects the visualization adds the selected split point for each split by each algorithm. This way one can validate the execution and the results of each algorithm. Moreover, examining this visualization can help the parametrization of the algorithms. Parameters ---------- hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans The object member of HiPart package that we want to manipulate on the premiss of this function. color_map : string The name of the matplotlib color map to be used for the data visualization. mdh_split_plot : bool If True, the visualization of the MDH object will include the projection vector and the splitting hyperplane visualization. See visualization.mdh_visualization for more information. Raises ------ TypeError If the hipart_object is not a member of the HiPart package raise an error for the possibility of unexpected errors that will be raised if the elements of the object are not correctly structured. ValueError If the "visualization_utility" attribute of the imported object is False, this causes some needed for this visualization data not to be created. Returns ------- plt : pyplot (module) The created visualization by this function. """ if isinstance(hipart_object, MDH) and mdh_split_plot: return mdh_visualization(hipart_object, color_map) # Check data compatibility with the function if isinstance(hipart_object, DePDDP): if not hipart_object.visualization_utility: raise ValueError( """The visualization of the data cannot be archived because the visualization_utility is False.""" ) with_hist = True with_marginal_scatter = False grid_size = 4 elif ( isinstance(hipart_object, IPDDP) or isinstance(hipart_object, KMPDDP) or isinstance(hipart_object, PDDP) ): if not hipart_object.visualization_utility: raise ValueError( """The visualization of the data cannot be archived because the visualization_utility is False.""" ) with_hist = False with_marginal_scatter = True grid_size = 4 elif isinstance(hipart_object, BisectingKmeans) or isinstance(hipart_object, MDH): with_hist = False with_marginal_scatter = False grid_size = 2 else: raise TypeError("can only process objects of the 'HiPart' package.") # prepare the necessary data for this visualization ( dictionary_of_nodes, internal_nodes, _, sample_color, ) = util.visualization_preparation(hipart_object, color_map) number_of_splits = len(internal_nodes) # Ensure that the root node will be always an internal node while is the # only node in the tree if number_of_splits == 0: internal_nodes = [0] # select the number of plot sub-figures row_plots = 3 if math.ceil(number_of_splits) > 4 else 2 # set figure size fig = plt.figure( figsize=(4 * row_plots, 3.5 * math.ceil(number_of_splits / row_plots)), ) # create grid for sub-figures gs = gridspec.GridSpec( math.ceil(number_of_splits / row_plots) * grid_size, row_plots * grid_size, fig, ) hists = [] hist_indicator = 0 for i in range(number_of_splits): # Extract the subplot position row_from, row_to, col_from, col_to = util.grid_position( current=i, rows=row_plots, splits=number_of_splits, with_marginal=(with_hist or with_marginal_scatter), ) # get the color of the sample that belong to the i internal node pr_col = sample_color[ hipart_object.tree.get_node(internal_nodes[i]).data["indices"] ] # differentiate the visualizations which need explanatory margin plot # from simple visualizations if with_hist: principal_projections = dictionary_of_nodes[internal_nodes[i]].data[ "projection" ] splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"] scatter = plt.subplot(gs[(row_from + 1): row_to, col_from:col_to]) if i == 0: total_elements = ( hipart_object.tree.get_node(internal_nodes[i]) .data["indices"] .shape[0] ) hist = plt.subplot( gs[row_from: row_to - 3, col_from:col_to], sharex=scatter, ) hists.append(hist) util.make_scatter_n_hist( scatter=scatter, hist=hist, PP=principal_projections, splitPoint=splitPoint, bandwidth_scale=hipart_object.bandwidth_scale, pr_col=pr_col, scaler=total_elements, ) current_ylim = hist.get_ylim() hist.set_ylim(hists[hist_indicator].get_ylim()) if hists[hist_indicator].get_ylim()[1] < current_ylim[1]: hist_indicator = i for j in hists: j.set_ylim(current_ylim) if i == 0: hist.title.set_text( "Original data with 1st split" ) else: hist.title.set_text("Split no. " + str(i + 1)) elif with_marginal_scatter: principal_projections = dictionary_of_nodes[internal_nodes[i]].data[ "projection" ] splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"] scatter = plt.subplot(gs[(row_from + 1): row_to, col_from:col_to]) marginal_scatter = plt.subplot( gs[row_from: row_to - 3, col_from:col_to], sharex=scatter ) centers = ( dictionary_of_nodes[internal_nodes[i]].data["centers"] if isinstance(hipart_object, KMPDDP) else None ) util.make_scatter_n_marginal_scatter( scatter=scatter, marginal_scatter=marginal_scatter, PP=principal_projections, splitPoint=splitPoint, pr_col=pr_col, centers=centers, ) if i == 0: marginal_scatter.title.set_text( "Original data with 1st split" ) else: marginal_scatter.title.set_text("Split no. " + str(i + 1)) else: # Bisecting k-Means doesn't execute PCA, so we execute it here for # the visualization if isinstance(hipart_object, BisectingKmeans) or isinstance( hipart_object, MDH ): projeciton_vectors = util.execute_decomposition_method( data_matrix=hipart_object.X[ dictionary_of_nodes[internal_nodes[i]].data["indices"] ], decomposition_method="pca", two_dimentions=True, decomposition_args={}, ) principal_projections = projeciton_vectors.fit_transform( hipart_object.X[ dictionary_of_nodes[internal_nodes[i]].data["indices"] ] ) show_split = False splitPoint = None else: principal_projections = dictionary_of_nodes[internal_nodes[i]].data[ "projection" ] show_split = True splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"] # create each individual visualization ax = plt.subplot(gs[row_from:row_to, col_from:col_to]) ax = util.make_simple_scatter( sp=ax, splitPoint=splitPoint, PP=principal_projections, pr_col=pr_col, show_split=show_split, ) if i == 0: # title the subplot ax.title.set_text( "Original data with 1st split" ) else: ax.title.set_text("Split no. " + str(i + 1)) plt.subplots_adjust(wspace=0.2, hspace=0.6) return plt
[docs] def mdh_visualization(mdh_obj, color_map="viridis"): """ Create the visualization of each of the splits generated by MDH algorithm, members of HiPart package. For each split, we visualize the data on the first two principal components, while the color of each sample is chosen depending on the cluster it belongs to. The colors throughout the separate split represent the same cluster each time. Moreover, we add the line of the projection vector projected onto the first two principal components, as well as a dashed line representing the splitting hyperplane. The splitting hyperplane a line that is perpendicular to the projection vector and passes through the minimum density point of the data on the projection direction. In addition, we create add the density of the data onto the projection vector in the form of a blue line, projected onto the projection vector. With this information one can validate the execution and the results of each algorithm and examining this visualization can help the parametrization of the algorithms. Warning: the projection vector and the splitting hyperplane in the plot may not seem perpendicular to each other. This is because the projection vector is projected onto the first two principal components, and we can't predict the viewing angle of the plot, in the tuple of dimensions. For the same reason, dotted line representing the splitting hyperplane may seem misplaced from the data it splits. Parameters ---------- mdh_obj : MDH The object member of HiPart package that we want to manipulate on the premiss of this function. color_map : string The name of the matplotlib color map to be used for the data visualization. Raises ------ TypeError If the hipart_object is not a member of the HiPart package raise an error for the possibility of unexpected errors that will be raised if the elements of the object are not correctly structured. ValueError If the "visualization_utility" attribute of the imported object is False, this causes some needed for this visualization data not to be created. Returns ------- plt : pyplot (module) The created visualization by this function. """ grid_size = 2 # prepare the necessary data for this visualization ( dictionary_of_nodes, internal_nodes, _, sample_color, ) = util.visualization_preparation(mdh_obj, color_map) number_of_splits = len(internal_nodes) # Ensure that the root node will be always an internal node while is the # only node in the tree if number_of_splits == 0: internal_nodes = [0] # select the number of plot sub-figures row_plots = 3 if math.ceil(number_of_splits) > 4 else 2 # set figure size fig = plt.figure( figsize=(4 * row_plots, 3.5 * math.ceil(number_of_splits / row_plots)), ) # create grid for sub-figures gs = gridspec.GridSpec( math.ceil(number_of_splits / row_plots) * grid_size, row_plots * grid_size, fig, ) for i in range(number_of_splits): # Extract the subplot position row_from, row_to, col_from, col_to = util.grid_position( current=i, rows=row_plots, splits=number_of_splits, with_marginal=False, ) # create each individual visualization ax = plt.subplot(gs[row_from:row_to, col_from:col_to]) # get the color of the sample that belong to the i internal node pr_col = sample_color[mdh_obj.tree.get_node(internal_nodes[i]).data["indices"]] node_data = mdh_obj.X[dictionary_of_nodes[internal_nodes[i]].data["indices"]] node_data = (node_data - np.mean(node_data, 0)) / np.std(node_data, 0) pca = PCA(n_components=2) pcaproj = pca.fit_transform(node_data) v = dictionary_of_nodes[internal_nodes[i]].data["projection_vectors"] b = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"] v_norm = np.linalg.norm(v) # convert v to unit vector v = v / v_norm pv = pca.transform(v.reshape(1, -1))[0] pv = pv / np.linalg.norm(pv) # $$ b = b / np.linalg.norm(pv) proj = np.dot(pcaproj, pv) h = util.band_const(mdh_obj.samples_number) * np.std(proj) x_grid = np.linspace(-3.5 * np.std(proj), 3.5 * np.std(proj), 1000) if np.abs(np.abs(pv[1]) - 1) <= 1.0e-8: ax.vlines( x=0, ymin=np.amin(x_grid), ymax=np.amax(x_grid), colors="cyan", zorder=2 ) phi = np.sign(pv[1]) * 0.5 * np.pi else: phi = np.arctan2(pv[1], pv[0]) ax.plot(x_grid, np.tan(phi) * x_grid, c="cyan", zorder=2) ax.plot(np.cos(phi) * proj, np.sin(phi) * proj, c="r", alpha=0.7, zorder=3) ax.scatter( np.cos(phi) * b, np.sin(phi) * b, marker="o", c="r", alpha=1, zorder=4 ) ax.plot( x_grid, np.sin(phi) * b + np.tan(phi + np.pi / 2) * (x_grid - np.cos(phi) * b), ":r", zorder=4, ) kde = KernelDensity(kernel="gaussian", bandwidth=h) kde.fit(proj[:, np.newaxis]) out = np.exp(kde.score_samples(x_grid[:, np.newaxis])) # plot kde P = phi R = np.matrix([[np.cos(P), -np.sin(P)], [np.sin(P), np.cos(P)]]) Y = np.matmul(R, np.row_stack((x_grid, out))) ax.plot(Y.A[0], Y.A[1], c="DarkBlue", lw=1, label="kde", zorder=4) ax.scatter(pcaproj[:, 0], pcaproj[:, 1], c=pr_col, s=5, marker=".", zorder=1) ax.set_xticks([]) ax.set_yticks([]) ax.set_xlim( min(pcaproj[:, 0]) - np.std(pcaproj[:, 0]) / 2, max(pcaproj[:, 0]) + np.std(pcaproj[:, 0]) / 2, ) ax.set_ylim( min(pcaproj[:, 1]) - np.std(pcaproj[:, 1]) / 2, max(pcaproj[:, 1]) + np.std(pcaproj[:, 1]) / 2, ) ax.grid(False) ax.margins(0.03) # title the subplot ax.title.set_text( "Original data with 1st split" ) if i == 0 else ax.title.set_text("Split no. " + str(i + 1)) plt.subplots_adjust(wspace=0.2, hspace=0.6) return plt
[docs] def dendrogram_visualization(hipart_object, cmap="viridis", default_coloring=True, **dendrogram_parameters): """ Create a dendrogram visualization of the divisive clustering based on the HiPart`s algorithm execution. The characteristic of this dendrogram is that the distance of the clusters is not the one depicted on the graph, but it represents the distance of each node from the base of the tree, also known as leaves. !! Important Note !! The "count_sort" parameter is set to True by default to preserve the hierarchical split structure of the tree. If the "count_sort" parameter is set to False, the dendrogram colors will not be assigned to the correct clusters. Parameters ---------- hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans The object member of the HiPart package that we want to manipulate on the premiss of this function. cmap : string The name of the matplotlib color map to be used for the data visualization. default_coloring : bool, optional If True, the dendrogram will be colored according to the default HiPart tree coloring, based on the clustering implemented by the package. If False, the dendrogram will be colored according to the default methodology used by the `scipy.cluster.hierarchy.dendrogram` function. Note that either way the "color_threshold" parameter can be changed. **dendrogram_parameters : optional All the parameters the scipy.cluster.hierarchy.dendrogram function can take except the color_threshold parameter. Except for the "color_threshold" parameter. This parameter takes a default threshold because, in the linkage, we do not calculate the actual distance of the found clusters but only their hierarchy. Raises ------ TypeError If the hipart_object is not a member of the HiPart package, the function raises a "TypeError" error. The error is raised for the possibility of unexpected faults if the elements of the hipart_object are not correctly structured. Returns ------- dn : dict A dictionary of data structures created to render the dendrogram. """ if not ( isinstance(hipart_object, DePDDP) or isinstance(hipart_object, IPDDP) or isinstance(hipart_object, KMPDDP) or isinstance(hipart_object, PDDP) or isinstance(hipart_object, BisectingKmeans) or isinstance(hipart_object, MDH) ): raise TypeError("can only process objects of the 'HiPart' package.") if "color_threshold" in dendrogram_parameters: raise ValueError( """The color_threshold parameter is not allowed in this function. The color_threshold parameter takes a default threshold because, in the linkage, we do not calculate the actual distance of the found clusters but only their hierarchy.""" ) if "above_threshold_color" not in dendrogram_parameters: default = "C0" else: default = dendrogram_parameters["above_threshold_color"] # Initialize the color map based on the number of clusters color_map = matplotlib.cm.get_cmap(cmap, hipart_object.max_clusters_number) # Create the linkage and the color keys for the dendrogram Z, link_keys = util.create_linkage(hipart_object.tree, color_keys=True) # Create the dendrogram if default_coloring: # Create the default coloring link color function for the dendrogram dn = hierarchy.dendrogram( Z, link_color_func=lambda x: util.search_dict(link_keys, x, color_map, default), **dendrogram_parameters ) else: # Create the default colors for the dendrogram function colors = np.array([util.rgba_to_hex(color_map(i)) for i in range(hipart_object.max_clusters_number)]) # keys = np.array([i.data["color_key"] for i in hipart_object.tree.leaves()]) # list(colors[keys])) hierarchy.set_link_color_palette(list(colors)) dn = hierarchy.dendrogram(Z, color_threshold=0.3, **dendrogram_parameters) return dn
[docs] def linkage(hipart_object): """ Create the linkage of the data based on the divisive hierarchical clustering for the members of the HiParts package. The linkage's characteristic is that the cluster distance on the third column of the linkage is not the distance of the clusters with each other, but it represents the distance of each node from the base of the tree, also known as leaves. Parameters ---------- hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans The object member of HiPart package that we want to manipulate on the premiss of this function. Returns ------- Z : numpy.ndarray The divisive clustering encoded as a linkage matrix. """ if not ( isinstance(hipart_object, DePDDP) or isinstance(hipart_object, IPDDP) or isinstance(hipart_object, KMPDDP) or isinstance(hipart_object, PDDP) or isinstance(hipart_object, BisectingKmeans) or isinstance(hipart_object, MDH) ): raise TypeError("can only process objects of the 'HiPart' package.") return util.create_linkage(hipart_object.tree)