Source code for HiPart.visualizations

# Copyright (c) 2022 Panagiotis Anagnostou

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Implementation module for the static visualization of the algorithms implemented
in the HiPart package.

@author: Panagiotis Anagnostou
@author: Nicos Pavlidis
"""

import matplotlib
import HiPart.__utility_functions as util
import math
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np

from HiPart.clustering import BisectingKmeans
from HiPart.clustering import DePDDP
from HiPart.clustering import IPDDP
from HiPart.clustering import KMPDDP
from HiPart.clustering import PDDP
from HiPart.clustering import MDH
from scipy.cluster import hierarchy
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity



[docs]
def split_visualization(hipart_object, color_map="viridis", mdh_split_plot=True):
    """
    Create the visualization of each of the splits generated by one of the
    divisive hierarchical clustering algorithms, members of HiPart package.
    For each split, we visualize the data on the first two principal
    components, while the color of each sample is chosen depending on the
    cluster it belongs to. The colors throughout the separate split represent
    the same cluster each time.

    Depending on the input object, the visualization is enriched with
    additional information related to the algorithm that created it. For the:

    1. DePDDP object, the visualization adds a marginal plot on the X-axis that
    represents the density of the data as extracted from the kernel density
    estimation for the first principal component. That is the information the
    dePDDP algorithm utilizes to split each cluster.

    2. KMPDDP object, the visualization adds a marginal plot on the X-axis
    that represents the data as they are projected on the first principal
    component, with the addition of the centers the k-Means finds within its
    execution on each split. That is the information the kM_PDDP algorithm
    utilizes to split each cluster.

    3. iPDDP along with PDDP objects, the visualization adds a marginal plot on
    the X-axis that represents the data as they are projected on the first
    principal component. That is the information the iPDDP algorithm utilizes
    to split each cluster.

    4. BisectingKmeans object does not include additional information in the
    visualization. That is because of the nature of the algorithm it
    implements.

    5. MDH object does not include additional information in the visualization.
    That is because of the nature of the algorithm it implements.

    Finally, for all the objects the visualization adds the selected split
    point for each split by each algorithm. This way one can validate the
    execution and the results of each algorithm. Moreover, examining this
    visualization can help the parametrization of the algorithms.

    Parameters
    ----------
    hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans
        The object member of HiPart package that we want to manipulate on the
        premiss of this function.
    color_map : string
        The name of the matplotlib color map to be used for the data
        visualization.
    mdh_split_plot : bool
        If True, the visualization of the MDH object will include the projection
        vector and the splitting hyperplane visualization. See
        visualization.mdh_visualization for more information.

    Raises
    ------
    TypeError
        If the hipart_object is not a member of the HiPart package raise an
        error for the possibility of unexpected errors that will be raised
        if the elements of the object are not correctly structured.
    ValueError
        If the "visualization_utility" attribute of the imported object is
        False, this causes some needed for this visualization data not
        to be created.

    Returns
    -------
    plt : pyplot (module)
        The created visualization by this function.

    """

    if isinstance(hipart_object, MDH) and mdh_split_plot:
        return mdh_visualization(hipart_object, color_map)

    # Check data compatibility with the function
    if isinstance(hipart_object, DePDDP):
        if not hipart_object.visualization_utility:
            raise ValueError(
                """The visualization of the data cannot be archived because the
                visualization_utility is False."""
            )
        with_hist = True
        with_marginal_scatter = False
        grid_size = 4
    elif (
        isinstance(hipart_object, IPDDP)
        or isinstance(hipart_object, KMPDDP)
        or isinstance(hipart_object, PDDP)
    ):
        if not hipart_object.visualization_utility:
            raise ValueError(
                """The visualization of the data cannot be archived because the
                visualization_utility is False."""
            )
        with_hist = False
        with_marginal_scatter = True
        grid_size = 4
    elif isinstance(hipart_object, BisectingKmeans) or isinstance(hipart_object, MDH):
        with_hist = False
        with_marginal_scatter = False
        grid_size = 2
    else:
        raise TypeError("can only process objects of the 'HiPart' package.")

    # prepare the necessary data for this visualization
    (
        dictionary_of_nodes,
        internal_nodes,
        _,
        sample_color,
    ) = util.visualization_preparation(hipart_object, color_map)
    number_of_splits = len(internal_nodes)

    # Ensure that the root node will be always an internal node while is the
    # only node in the tree
    if number_of_splits == 0:
        internal_nodes = [0]

    # select the number of plot sub-figures
    row_plots = 3 if math.ceil(number_of_splits) > 4 else 2

    # set figure size
    fig = plt.figure(
        figsize=(4 * row_plots, 3.5 * math.ceil(number_of_splits / row_plots)),
    )

    # create grid for sub-figures
    gs = gridspec.GridSpec(
        math.ceil(number_of_splits / row_plots) * grid_size,
        row_plots * grid_size,
        fig,
    )

    hists = []
    hist_indicator = 0

    for i in range(number_of_splits):
        # Extract the subplot position
        row_from, row_to, col_from, col_to = util.grid_position(
            current=i,
            rows=row_plots,
            splits=number_of_splits,
            with_marginal=(with_hist or with_marginal_scatter),
        )

        # get the color of the sample that belong to the i internal node
        pr_col = sample_color[
            hipart_object.tree.get_node(internal_nodes[i]).data["indices"]
        ]

        # differentiate the visualizations which need explanatory margin plot
        # from simple visualizations
        if with_hist:
            principal_projections = dictionary_of_nodes[internal_nodes[i]].data[
                "projection"
            ]
            splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]

            scatter = plt.subplot(gs[(row_from + 1): row_to, col_from:col_to])
            if i == 0:
                total_elements = (
                    hipart_object.tree.get_node(internal_nodes[i])
                    .data["indices"]
                    .shape[0]
                )

            hist = plt.subplot(
                gs[row_from: row_to - 3, col_from:col_to],
                sharex=scatter,
            )
            hists.append(hist)

            util.make_scatter_n_hist(
                scatter=scatter,
                hist=hist,
                PP=principal_projections,
                splitPoint=splitPoint,
                bandwidth_scale=hipart_object.bandwidth_scale,
                pr_col=pr_col,
                scaler=total_elements,
            )
            current_ylim = hist.get_ylim()
            hist.set_ylim(hists[hist_indicator].get_ylim())

            if hists[hist_indicator].get_ylim()[1] < current_ylim[1]:
                hist_indicator = i
                for j in hists:
                    j.set_ylim(current_ylim)

            if i == 0:
                hist.title.set_text(
                    "Original data with 1st split"
                )
            else:
                hist.title.set_text("Split no. " + str(i + 1))

        elif with_marginal_scatter:
            principal_projections = dictionary_of_nodes[internal_nodes[i]].data[
                "projection"
            ]
            splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]

            scatter = plt.subplot(gs[(row_from + 1): row_to, col_from:col_to])
            marginal_scatter = plt.subplot(
                gs[row_from: row_to - 3, col_from:col_to], sharex=scatter
            )
            centers = (
                dictionary_of_nodes[internal_nodes[i]].data["centers"]
                if isinstance(hipart_object, KMPDDP)
                else None
            )
            util.make_scatter_n_marginal_scatter(
                scatter=scatter,
                marginal_scatter=marginal_scatter,
                PP=principal_projections,
                splitPoint=splitPoint,
                pr_col=pr_col,
                centers=centers,
            )

            if i == 0:
                marginal_scatter.title.set_text(
                    "Original data with 1st split"
                )
            else:
                marginal_scatter.title.set_text("Split no. " + str(i + 1))

        else:
            # Bisecting k-Means doesn't execute PCA, so we execute it here for
            # the visualization
            if isinstance(hipart_object, BisectingKmeans) or isinstance(
                hipart_object, MDH
            ):
                projeciton_vectors = util.execute_decomposition_method(
                    data_matrix=hipart_object.X[
                        dictionary_of_nodes[internal_nodes[i]].data["indices"]
                    ],
                    decomposition_method="pca",
                    two_dimentions=True,
                    decomposition_args={},
                )
                principal_projections = projeciton_vectors.fit_transform(
                    hipart_object.X[
                        dictionary_of_nodes[internal_nodes[i]].data["indices"]
                    ]
                )
                show_split = False
                splitPoint = None
            else:
                principal_projections = dictionary_of_nodes[internal_nodes[i]].data[
                    "projection"
                ]
                show_split = True
                splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]

            # create each individual visualization
            ax = plt.subplot(gs[row_from:row_to, col_from:col_to])
            ax = util.make_simple_scatter(
                sp=ax,
                splitPoint=splitPoint,
                PP=principal_projections,
                pr_col=pr_col,
                show_split=show_split,
            )

            if i == 0:
                # title the subplot
                ax.title.set_text(
                    "Original data with 1st split"
                )
            else:
                ax.title.set_text("Split no. " + str(i + 1))

    plt.subplots_adjust(wspace=0.2, hspace=0.6)
    return plt




[docs]
def mdh_visualization(mdh_obj, color_map="viridis"):
    """
    Create the visualization of each of the splits generated by MDH algorithm,
    members of HiPart package. For each split, we visualize the data on the
    first two principal components, while the color of each sample is chosen
    depending on the cluster it belongs to. The colors throughout the separate
    split represent the same cluster each time.

    Moreover, we add the line of the projection vector projected onto the first
    two principal components, as well as a dashed line representing the
    splitting hyperplane. The splitting hyperplane a line that is perpendicular
    to the projection vector and passes through the minimum density point of the
    data on the projection direction. In addition, we create add the density of
    the data onto the projection vector in the form of a blue line, projected
    onto the projection vector. With this information one can validate the
    execution and the results of each algorithm and examining this visualization
    can help the parametrization of the algorithms.

    Warning: the projection vector and the splitting hyperplane in the plot may
    not seem perpendicular to each other. This is because the projection vector
    is projected onto the first two principal components, and we can't predict
    the viewing angle of the plot, in the tuple of dimensions. For the same
    reason, dotted line representing the splitting hyperplane may seem misplaced
    from the data it splits.

    Parameters
    ----------
    mdh_obj : MDH
        The object member of HiPart package that we want to manipulate on the
        premiss of this function.
    color_map : string
        The name of the matplotlib color map to be used for the data
        visualization.

    Raises
    ------
    TypeError
        If the hipart_object is not a member of the HiPart package raise an
        error for the possibility of unexpected errors that will be raised
        if the elements of the object are not correctly structured.
    ValueError
        If the "visualization_utility" attribute of the imported object is
        False, this causes some needed for this visualization data not
        to be created.

    Returns
    -------
    plt : pyplot (module)
        The created visualization by this function.

    """
    grid_size = 2

    # prepare the necessary data for this visualization
    (
        dictionary_of_nodes,
        internal_nodes,
        _,
        sample_color,
    ) = util.visualization_preparation(mdh_obj, color_map)
    number_of_splits = len(internal_nodes)

    # Ensure that the root node will be always an internal node while is the
    # only node in the tree
    if number_of_splits == 0:
        internal_nodes = [0]

    # select the number of plot sub-figures
    row_plots = 3 if math.ceil(number_of_splits) > 4 else 2

    # set figure size
    fig = plt.figure(
        figsize=(4 * row_plots, 3.5 * math.ceil(number_of_splits / row_plots)),
    )

    # create grid for sub-figures
    gs = gridspec.GridSpec(
        math.ceil(number_of_splits / row_plots) * grid_size,
        row_plots * grid_size,
        fig,
    )

    for i in range(number_of_splits):
        # Extract the subplot position
        row_from, row_to, col_from, col_to = util.grid_position(
            current=i,
            rows=row_plots,
            splits=number_of_splits,
            with_marginal=False,
        )

        # create each individual visualization
        ax = plt.subplot(gs[row_from:row_to, col_from:col_to])

        # get the color of the sample that belong to the i internal node
        pr_col = sample_color[mdh_obj.tree.get_node(internal_nodes[i]).data["indices"]]

        node_data = mdh_obj.X[dictionary_of_nodes[internal_nodes[i]].data["indices"]]
        node_data = (node_data - np.mean(node_data, 0)) / np.std(node_data, 0)

        pca = PCA(n_components=2)
        pcaproj = pca.fit_transform(node_data)
        v = dictionary_of_nodes[internal_nodes[i]].data["projection_vectors"]
        b = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]

        v_norm = np.linalg.norm(v)
        # convert v to unit vector
        v = v / v_norm

        pv = pca.transform(v.reshape(1, -1))[0]
        pv = pv / np.linalg.norm(pv)  # $$
        b = b / np.linalg.norm(pv)

        proj = np.dot(pcaproj, pv)
        h = util.band_const(mdh_obj.samples_number) * np.std(proj)

        x_grid = np.linspace(-3.5 * np.std(proj), 3.5 * np.std(proj), 1000)

        if np.abs(np.abs(pv[1]) - 1) <= 1.0e-8:
            ax.vlines(
                x=0, ymin=np.amin(x_grid), ymax=np.amax(x_grid), colors="cyan", zorder=2
            )
            phi = np.sign(pv[1]) * 0.5 * np.pi
        else:
            phi = np.arctan2(pv[1], pv[0])
            ax.plot(x_grid, np.tan(phi) * x_grid, c="cyan", zorder=2)

        ax.plot(np.cos(phi) * proj, np.sin(phi) * proj, c="r", alpha=0.7, zorder=3)

        ax.scatter(
            np.cos(phi) * b, np.sin(phi) * b, marker="o", c="r", alpha=1, zorder=4
        )
        ax.plot(
            x_grid,
            np.sin(phi) * b + np.tan(phi + np.pi / 2) * (x_grid - np.cos(phi) * b),
            ":r",
            zorder=4,
        )

        kde = KernelDensity(kernel="gaussian", bandwidth=h)
        kde.fit(proj[:, np.newaxis])
        out = np.exp(kde.score_samples(x_grid[:, np.newaxis]))

        # plot kde
        P = phi
        R = np.matrix([[np.cos(P), -np.sin(P)], [np.sin(P), np.cos(P)]])
        Y = np.matmul(R, np.row_stack((x_grid, out)))
        ax.plot(Y.A[0], Y.A[1], c="DarkBlue", lw=1, label="kde", zorder=4)

        ax.scatter(pcaproj[:, 0], pcaproj[:, 1], c=pr_col, s=5, marker=".", zorder=1)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlim(
            min(pcaproj[:, 0]) - np.std(pcaproj[:, 0]) / 2,
            max(pcaproj[:, 0]) + np.std(pcaproj[:, 0]) / 2,
        )
        ax.set_ylim(
            min(pcaproj[:, 1]) - np.std(pcaproj[:, 1]) / 2,
            max(pcaproj[:, 1]) + np.std(pcaproj[:, 1]) / 2,
        )
        ax.grid(False)
        ax.margins(0.03)

        # title the subplot
        ax.title.set_text(
            "Original data with 1st split"
        ) if i == 0 else ax.title.set_text("Split no. " + str(i + 1))

    plt.subplots_adjust(wspace=0.2, hspace=0.6)
    return plt




[docs]
def dendrogram_visualization(hipart_object, cmap="viridis", default_coloring=True, **dendrogram_parameters):
    """
    Create a dendrogram visualization of the divisive clustering based on the
    HiPart`s algorithm execution. The characteristic of this dendrogram is that
    the distance of the clusters is not the one depicted on the graph, but it
    represents the distance of each node from the base of the tree, also known
    as leaves.

    !! Important Note !!
    The "count_sort" parameter is set to True by default to preserve the
    hierarchical split structure of the tree. If the "count_sort" parameter is
    set to False, the dendrogram colors will not be assigned to the correct
    clusters.

    Parameters
    ----------
    hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans
        The object member of the HiPart package that we want to manipulate on
        the premiss of this function.
    cmap : string
        The name of the matplotlib color map to be used for the data
        visualization.
    default_coloring : bool, optional
        If True, the dendrogram will be colored according to the default HiPart
        tree coloring, based on the clustering implemented by the package. If
        False, the dendrogram will be colored according to the default
        methodology used by the `scipy.cluster.hierarchy.dendrogram` function.
        Note that either way the "color_threshold" parameter can be changed.
    **dendrogram_parameters : optional
        All the parameters the scipy.cluster.hierarchy.dendrogram function can
        take except the color_threshold parameter. Except for the
        "color_threshold" parameter. This parameter takes a default threshold
        because, in the linkage, we do not calculate the actual distance of the
        found clusters but only their hierarchy.

    Raises
    ------
    TypeError
        If the hipart_object is not a member of the HiPart package, the
        function raises a "TypeError" error. The error is raised for the
        possibility of unexpected faults if the elements of the hipart_object
        are not correctly structured.

    Returns
    -------
    dn : dict
        A dictionary of data structures created to render the dendrogram.

    """

    if not (
        isinstance(hipart_object, DePDDP)
        or isinstance(hipart_object, IPDDP)
        or isinstance(hipart_object, KMPDDP)
        or isinstance(hipart_object, PDDP)
        or isinstance(hipart_object, BisectingKmeans)
        or isinstance(hipart_object, MDH)
    ):
        raise TypeError("can only process objects of the 'HiPart' package.")

    if "color_threshold" in dendrogram_parameters:
        raise ValueError(
            """The color_threshold parameter is not allowed in this function.
            The color_threshold parameter takes a default threshold because, in
            the linkage, we do not calculate the actual distance of the found
            clusters but only their hierarchy."""
        )

    if "above_threshold_color" not in dendrogram_parameters:
        default = "C0"
    else:
        default = dendrogram_parameters["above_threshold_color"]

    # Initialize the color map based on the number of clusters
    color_map = matplotlib.cm.get_cmap(cmap, hipart_object.max_clusters_number)

    # Create the linkage and the color keys for the dendrogram
    Z, link_keys = util.create_linkage(hipart_object.tree, color_keys=True)

    # Create the dendrogram
    if default_coloring:
        # Create the default coloring link color function for the dendrogram
        dn = hierarchy.dendrogram(
            Z,
            link_color_func=lambda x: util.search_dict(link_keys, x, color_map, default),
            **dendrogram_parameters
        )
    else:
        # Create the default colors for the dendrogram function
        colors = np.array([util.rgba_to_hex(color_map(i)) for i in range(hipart_object.max_clusters_number)])
        # keys = np.array([i.data["color_key"] for i in hipart_object.tree.leaves()])
        # list(colors[keys]))
        hierarchy.set_link_color_palette(list(colors))
        dn = hierarchy.dendrogram(Z, color_threshold=0.3, **dendrogram_parameters)

    return dn




[docs]
def linkage(hipart_object):
    """
    Create the linkage of the data based on the divisive hierarchical
    clustering for the members of the HiParts package. The linkage's
    characteristic is that the cluster distance on the third column of the
    linkage is not the distance of the clusters with each other, but it
    represents the distance of each node from the base of the tree, also
    known as leaves.

    Parameters
    ----------
    hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans
        The object member of HiPart package that we want to manipulate on the
        premiss of this function.

    Returns
    -------
    Z : numpy.ndarray
        The divisive clustering encoded as a linkage matrix.

    """

    if not (
        isinstance(hipart_object, DePDDP)
        or isinstance(hipart_object, IPDDP)
        or isinstance(hipart_object, KMPDDP)
        or isinstance(hipart_object, PDDP)
        or isinstance(hipart_object, BisectingKmeans)
        or isinstance(hipart_object, MDH)
    ):
        raise TypeError("can only process objects of the 'HiPart' package.")

    return util.create_linkage(hipart_object.tree)