# Copyright (c) 2022 Panagiotis Anagnostou
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Implementation module for the static visualization of the algorithms implemented
in the HiPart package.
@author: Panagiotis Anagnostou
@author: Nicos Pavlidis
"""
import matplotlib
import HiPart.__utility_functions as util
import math
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
from HiPart.clustering import BisectingKmeans
from HiPart.clustering import DePDDP
from HiPart.clustering import IPDDP
from HiPart.clustering import KMPDDP
from HiPart.clustering import PDDP
from HiPart.clustering import MDH
from scipy.cluster import hierarchy
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity
[docs]
def split_visualization(hipart_object, color_map="viridis", mdh_split_plot=True):
"""
Create the visualization of each of the splits generated by one of the
divisive hierarchical clustering algorithms, members of HiPart package.
For each split, we visualize the data on the first two principal
components, while the color of each sample is chosen depending on the
cluster it belongs to. The colors throughout the separate split represent
the same cluster each time.
Depending on the input object, the visualization is enriched with
additional information related to the algorithm that created it. For the:
1. DePDDP object, the visualization adds a marginal plot on the X-axis that
represents the density of the data as extracted from the kernel density
estimation for the first principal component. That is the information the
dePDDP algorithm utilizes to split each cluster.
2. KMPDDP object, the visualization adds a marginal plot on the X-axis
that represents the data as they are projected on the first principal
component, with the addition of the centers the k-Means finds within its
execution on each split. That is the information the kM_PDDP algorithm
utilizes to split each cluster.
3. iPDDP along with PDDP objects, the visualization adds a marginal plot on
the X-axis that represents the data as they are projected on the first
principal component. That is the information the iPDDP algorithm utilizes
to split each cluster.
4. BisectingKmeans object does not include additional information in the
visualization. That is because of the nature of the algorithm it
implements.
5. MDH object does not include additional information in the visualization.
That is because of the nature of the algorithm it implements.
Finally, for all the objects the visualization adds the selected split
point for each split by each algorithm. This way one can validate the
execution and the results of each algorithm. Moreover, examining this
visualization can help the parametrization of the algorithms.
Parameters
----------
hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans
The object member of HiPart package that we want to manipulate on the
premiss of this function.
color_map : string
The name of the matplotlib color map to be used for the data
visualization.
mdh_split_plot : bool
If True, the visualization of the MDH object will include the projection
vector and the splitting hyperplane visualization. See
visualization.mdh_visualization for more information.
Raises
------
TypeError
If the hipart_object is not a member of the HiPart package raise an
error for the possibility of unexpected errors that will be raised
if the elements of the object are not correctly structured.
ValueError
If the "visualization_utility" attribute of the imported object is
False, this causes some needed for this visualization data not
to be created.
Returns
-------
plt : pyplot (module)
The created visualization by this function.
"""
if isinstance(hipart_object, MDH) and mdh_split_plot:
return mdh_visualization(hipart_object, color_map)
# Check data compatibility with the function
if isinstance(hipart_object, DePDDP):
if not hipart_object.visualization_utility:
raise ValueError(
"""The visualization of the data cannot be archived because the
visualization_utility is False."""
)
with_hist = True
with_marginal_scatter = False
grid_size = 4
elif (
isinstance(hipart_object, IPDDP)
or isinstance(hipart_object, KMPDDP)
or isinstance(hipart_object, PDDP)
):
if not hipart_object.visualization_utility:
raise ValueError(
"""The visualization of the data cannot be archived because the
visualization_utility is False."""
)
with_hist = False
with_marginal_scatter = True
grid_size = 4
elif isinstance(hipart_object, BisectingKmeans) or isinstance(hipart_object, MDH):
with_hist = False
with_marginal_scatter = False
grid_size = 2
else:
raise TypeError("can only process objects of the 'HiPart' package.")
# prepare the necessary data for this visualization
(
dictionary_of_nodes,
internal_nodes,
_,
sample_color,
) = util.visualization_preparation(hipart_object, color_map)
number_of_splits = len(internal_nodes)
# Ensure that the root node will be always an internal node while is the
# only node in the tree
if number_of_splits == 0:
internal_nodes = [0]
# select the number of plot sub-figures
row_plots = 3 if math.ceil(number_of_splits) > 4 else 2
# set figure size
fig = plt.figure(
figsize=(4 * row_plots, 3.5 * math.ceil(number_of_splits / row_plots)),
)
# create grid for sub-figures
gs = gridspec.GridSpec(
math.ceil(number_of_splits / row_plots) * grid_size,
row_plots * grid_size,
fig,
)
hists = []
hist_indicator = 0
for i in range(number_of_splits):
# Extract the subplot position
row_from, row_to, col_from, col_to = util.grid_position(
current=i,
rows=row_plots,
splits=number_of_splits,
with_marginal=(with_hist or with_marginal_scatter),
)
# get the color of the sample that belong to the i internal node
pr_col = sample_color[
hipart_object.tree.get_node(internal_nodes[i]).data["indices"]
]
# differentiate the visualizations which need explanatory margin plot
# from simple visualizations
if with_hist:
principal_projections = dictionary_of_nodes[internal_nodes[i]].data[
"projection"
]
splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]
scatter = plt.subplot(gs[(row_from + 1): row_to, col_from:col_to])
if i == 0:
total_elements = (
hipart_object.tree.get_node(internal_nodes[i])
.data["indices"]
.shape[0]
)
hist = plt.subplot(
gs[row_from: row_to - 3, col_from:col_to],
sharex=scatter,
)
hists.append(hist)
util.make_scatter_n_hist(
scatter=scatter,
hist=hist,
PP=principal_projections,
splitPoint=splitPoint,
bandwidth_scale=hipart_object.bandwidth_scale,
pr_col=pr_col,
scaler=total_elements,
)
current_ylim = hist.get_ylim()
hist.set_ylim(hists[hist_indicator].get_ylim())
if hists[hist_indicator].get_ylim()[1] < current_ylim[1]:
hist_indicator = i
for j in hists:
j.set_ylim(current_ylim)
if i == 0:
hist.title.set_text(
"Original data with 1st split"
)
else:
hist.title.set_text("Split no. " + str(i + 1))
elif with_marginal_scatter:
principal_projections = dictionary_of_nodes[internal_nodes[i]].data[
"projection"
]
splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]
scatter = plt.subplot(gs[(row_from + 1): row_to, col_from:col_to])
marginal_scatter = plt.subplot(
gs[row_from: row_to - 3, col_from:col_to], sharex=scatter
)
centers = (
dictionary_of_nodes[internal_nodes[i]].data["centers"]
if isinstance(hipart_object, KMPDDP)
else None
)
util.make_scatter_n_marginal_scatter(
scatter=scatter,
marginal_scatter=marginal_scatter,
PP=principal_projections,
splitPoint=splitPoint,
pr_col=pr_col,
centers=centers,
)
if i == 0:
marginal_scatter.title.set_text(
"Original data with 1st split"
)
else:
marginal_scatter.title.set_text("Split no. " + str(i + 1))
else:
# Bisecting k-Means doesn't execute PCA, so we execute it here for
# the visualization
if isinstance(hipart_object, BisectingKmeans) or isinstance(
hipart_object, MDH
):
projeciton_vectors = util.execute_decomposition_method(
data_matrix=hipart_object.X[
dictionary_of_nodes[internal_nodes[i]].data["indices"]
],
decomposition_method="pca",
two_dimentions=True,
decomposition_args={},
)
principal_projections = projeciton_vectors.fit_transform(
hipart_object.X[
dictionary_of_nodes[internal_nodes[i]].data["indices"]
]
)
show_split = False
splitPoint = None
else:
principal_projections = dictionary_of_nodes[internal_nodes[i]].data[
"projection"
]
show_split = True
splitPoint = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]
# create each individual visualization
ax = plt.subplot(gs[row_from:row_to, col_from:col_to])
ax = util.make_simple_scatter(
sp=ax,
splitPoint=splitPoint,
PP=principal_projections,
pr_col=pr_col,
show_split=show_split,
)
if i == 0:
# title the subplot
ax.title.set_text(
"Original data with 1st split"
)
else:
ax.title.set_text("Split no. " + str(i + 1))
plt.subplots_adjust(wspace=0.2, hspace=0.6)
return plt
[docs]
def mdh_visualization(mdh_obj, color_map="viridis"):
"""
Create the visualization of each of the splits generated by MDH algorithm,
members of HiPart package. For each split, we visualize the data on the
first two principal components, while the color of each sample is chosen
depending on the cluster it belongs to. The colors throughout the separate
split represent the same cluster each time.
Moreover, we add the line of the projection vector projected onto the first
two principal components, as well as a dashed line representing the
splitting hyperplane. The splitting hyperplane a line that is perpendicular
to the projection vector and passes through the minimum density point of the
data on the projection direction. In addition, we create add the density of
the data onto the projection vector in the form of a blue line, projected
onto the projection vector. With this information one can validate the
execution and the results of each algorithm and examining this visualization
can help the parametrization of the algorithms.
Warning: the projection vector and the splitting hyperplane in the plot may
not seem perpendicular to each other. This is because the projection vector
is projected onto the first two principal components, and we can't predict
the viewing angle of the plot, in the tuple of dimensions. For the same
reason, dotted line representing the splitting hyperplane may seem misplaced
from the data it splits.
Parameters
----------
mdh_obj : MDH
The object member of HiPart package that we want to manipulate on the
premiss of this function.
color_map : string
The name of the matplotlib color map to be used for the data
visualization.
Raises
------
TypeError
If the hipart_object is not a member of the HiPart package raise an
error for the possibility of unexpected errors that will be raised
if the elements of the object are not correctly structured.
ValueError
If the "visualization_utility" attribute of the imported object is
False, this causes some needed for this visualization data not
to be created.
Returns
-------
plt : pyplot (module)
The created visualization by this function.
"""
grid_size = 2
# prepare the necessary data for this visualization
(
dictionary_of_nodes,
internal_nodes,
_,
sample_color,
) = util.visualization_preparation(mdh_obj, color_map)
number_of_splits = len(internal_nodes)
# Ensure that the root node will be always an internal node while is the
# only node in the tree
if number_of_splits == 0:
internal_nodes = [0]
# select the number of plot sub-figures
row_plots = 3 if math.ceil(number_of_splits) > 4 else 2
# set figure size
fig = plt.figure(
figsize=(4 * row_plots, 3.5 * math.ceil(number_of_splits / row_plots)),
)
# create grid for sub-figures
gs = gridspec.GridSpec(
math.ceil(number_of_splits / row_plots) * grid_size,
row_plots * grid_size,
fig,
)
for i in range(number_of_splits):
# Extract the subplot position
row_from, row_to, col_from, col_to = util.grid_position(
current=i,
rows=row_plots,
splits=number_of_splits,
with_marginal=False,
)
# create each individual visualization
ax = plt.subplot(gs[row_from:row_to, col_from:col_to])
# get the color of the sample that belong to the i internal node
pr_col = sample_color[mdh_obj.tree.get_node(internal_nodes[i]).data["indices"]]
node_data = mdh_obj.X[dictionary_of_nodes[internal_nodes[i]].data["indices"]]
node_data = (node_data - np.mean(node_data, 0)) / np.std(node_data, 0)
pca = PCA(n_components=2)
pcaproj = pca.fit_transform(node_data)
v = dictionary_of_nodes[internal_nodes[i]].data["projection_vectors"]
b = dictionary_of_nodes[internal_nodes[i]].data["splitpoint"]
v_norm = np.linalg.norm(v)
# convert v to unit vector
v = v / v_norm
pv = pca.transform(v.reshape(1, -1))[0]
pv = pv / np.linalg.norm(pv) # $$
b = b / np.linalg.norm(pv)
proj = np.dot(pcaproj, pv)
h = util.band_const(mdh_obj.samples_number) * np.std(proj)
x_grid = np.linspace(-3.5 * np.std(proj), 3.5 * np.std(proj), 1000)
if np.abs(np.abs(pv[1]) - 1) <= 1.0e-8:
ax.vlines(
x=0, ymin=np.amin(x_grid), ymax=np.amax(x_grid), colors="cyan", zorder=2
)
phi = np.sign(pv[1]) * 0.5 * np.pi
else:
phi = np.arctan2(pv[1], pv[0])
ax.plot(x_grid, np.tan(phi) * x_grid, c="cyan", zorder=2)
ax.plot(np.cos(phi) * proj, np.sin(phi) * proj, c="r", alpha=0.7, zorder=3)
ax.scatter(
np.cos(phi) * b, np.sin(phi) * b, marker="o", c="r", alpha=1, zorder=4
)
ax.plot(
x_grid,
np.sin(phi) * b + np.tan(phi + np.pi / 2) * (x_grid - np.cos(phi) * b),
":r",
zorder=4,
)
kde = KernelDensity(kernel="gaussian", bandwidth=h)
kde.fit(proj[:, np.newaxis])
out = np.exp(kde.score_samples(x_grid[:, np.newaxis]))
# plot kde
P = phi
R = np.matrix([[np.cos(P), -np.sin(P)], [np.sin(P), np.cos(P)]])
Y = np.matmul(R, np.row_stack((x_grid, out)))
ax.plot(Y.A[0], Y.A[1], c="DarkBlue", lw=1, label="kde", zorder=4)
ax.scatter(pcaproj[:, 0], pcaproj[:, 1], c=pr_col, s=5, marker=".", zorder=1)
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlim(
min(pcaproj[:, 0]) - np.std(pcaproj[:, 0]) / 2,
max(pcaproj[:, 0]) + np.std(pcaproj[:, 0]) / 2,
)
ax.set_ylim(
min(pcaproj[:, 1]) - np.std(pcaproj[:, 1]) / 2,
max(pcaproj[:, 1]) + np.std(pcaproj[:, 1]) / 2,
)
ax.grid(False)
ax.margins(0.03)
# title the subplot
ax.title.set_text(
"Original data with 1st split"
) if i == 0 else ax.title.set_text("Split no. " + str(i + 1))
plt.subplots_adjust(wspace=0.2, hspace=0.6)
return plt
[docs]
def dendrogram_visualization(hipart_object, cmap="viridis", default_coloring=True, **dendrogram_parameters):
"""
Create a dendrogram visualization of the divisive clustering based on the
HiPart`s algorithm execution. The characteristic of this dendrogram is that
the distance of the clusters is not the one depicted on the graph, but it
represents the distance of each node from the base of the tree, also known
as leaves.
!! Important Note !!
The "count_sort" parameter is set to True by default to preserve the
hierarchical split structure of the tree. If the "count_sort" parameter is
set to False, the dendrogram colors will not be assigned to the correct
clusters.
Parameters
----------
hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans
The object member of the HiPart package that we want to manipulate on
the premiss of this function.
cmap : string
The name of the matplotlib color map to be used for the data
visualization.
default_coloring : bool, optional
If True, the dendrogram will be colored according to the default HiPart
tree coloring, based on the clustering implemented by the package. If
False, the dendrogram will be colored according to the default
methodology used by the `scipy.cluster.hierarchy.dendrogram` function.
Note that either way the "color_threshold" parameter can be changed.
**dendrogram_parameters : optional
All the parameters the scipy.cluster.hierarchy.dendrogram function can
take except the color_threshold parameter. Except for the
"color_threshold" parameter. This parameter takes a default threshold
because, in the linkage, we do not calculate the actual distance of the
found clusters but only their hierarchy.
Raises
------
TypeError
If the hipart_object is not a member of the HiPart package, the
function raises a "TypeError" error. The error is raised for the
possibility of unexpected faults if the elements of the hipart_object
are not correctly structured.
Returns
-------
dn : dict
A dictionary of data structures created to render the dendrogram.
"""
if not (
isinstance(hipart_object, DePDDP)
or isinstance(hipart_object, IPDDP)
or isinstance(hipart_object, KMPDDP)
or isinstance(hipart_object, PDDP)
or isinstance(hipart_object, BisectingKmeans)
or isinstance(hipart_object, MDH)
):
raise TypeError("can only process objects of the 'HiPart' package.")
if "color_threshold" in dendrogram_parameters:
raise ValueError(
"""The color_threshold parameter is not allowed in this function.
The color_threshold parameter takes a default threshold because, in
the linkage, we do not calculate the actual distance of the found
clusters but only their hierarchy."""
)
if "above_threshold_color" not in dendrogram_parameters:
default = "C0"
else:
default = dendrogram_parameters["above_threshold_color"]
# Initialize the color map based on the number of clusters
color_map = matplotlib.cm.get_cmap(cmap, hipart_object.max_clusters_number)
# Create the linkage and the color keys for the dendrogram
Z, link_keys = util.create_linkage(hipart_object.tree, color_keys=True)
# Create the dendrogram
if default_coloring:
# Create the default coloring link color function for the dendrogram
dn = hierarchy.dendrogram(
Z,
link_color_func=lambda x: util.search_dict(link_keys, x, color_map, default),
**dendrogram_parameters
)
else:
# Create the default colors for the dendrogram function
colors = np.array([util.rgba_to_hex(color_map(i)) for i in range(hipart_object.max_clusters_number)])
# keys = np.array([i.data["color_key"] for i in hipart_object.tree.leaves()])
# list(colors[keys]))
hierarchy.set_link_color_palette(list(colors))
dn = hierarchy.dendrogram(Z, color_threshold=0.3, **dendrogram_parameters)
return dn
[docs]
def linkage(hipart_object):
"""
Create the linkage of the data based on the divisive hierarchical
clustering for the members of the HiParts package. The linkage's
characteristic is that the cluster distance on the third column of the
linkage is not the distance of the clusters with each other, but it
represents the distance of each node from the base of the tree, also
known as leaves.
Parameters
----------
hipart_object : DePDDP or IPDDP or KMPDDP or PDDP or MDH or BisectingKmeans
The object member of HiPart package that we want to manipulate on the
premiss of this function.
Returns
-------
Z : numpy.ndarray
The divisive clustering encoded as a linkage matrix.
"""
if not (
isinstance(hipart_object, DePDDP)
or isinstance(hipart_object, IPDDP)
or isinstance(hipart_object, KMPDDP)
or isinstance(hipart_object, PDDP)
or isinstance(hipart_object, BisectingKmeans)
or isinstance(hipart_object, MDH)
):
raise TypeError("can only process objects of the 'HiPart' package.")
return util.create_linkage(hipart_object.tree)