MAGE tutorial : multiple monoview generators.

In this tutorial, we will learn how to use different monoview generators and configure them. Let us first define the visualization functions :

[1]:

from sklearn.datasets import make_classification, make_gaussian_quantiles
import numpy as np
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import DEFAULT_PLOTLY_COLORS

from multiview_generator.gaussian_classes import MultiViewGaussianSubProblemsGenerator
from multiview_generator.sub_problems import *


def plot_3d(data):
    """Build a 3D scatter figure with one trace per class.

    ``data`` is iterated in order; each element is indexed as a 2D array
    whose first three columns are used as the x, y and z coordinates.
    Traces are named "Class 1", "Class 2", ... following that order.

    Returns the resulting ``plotly.graph_objects.Figure``.
    """
    figure = go.Figure()
    # Class names are 1-based, so start the counter at 1.
    for class_number, points in enumerate(data, start=1):
        trace = go.Scatter3d(
            x=points[:, 0],
            y=points[:, 1],
            z=points[:, 2],
            name="Class {}".format(class_number),
            mode="markers",
            marker={"size": 1},
        )
        figure.add_trace(trace)
    return figure

def plot_2d(data):
    """Build a 2D scatter figure with one trace per class.

    ``data`` is iterated in order; each element is indexed as a 2D array
    whose first two columns are used as the x and y coordinates.
    Traces are named "Class 1", "Class 2", ... following that order.

    Returns the resulting ``plotly.graph_objects.Figure``.
    """
    figure = go.Figure()
    # Class names are 1-based, so start the counter at 1.
    for class_number, points in enumerate(data, start=1):
        trace = go.Scatter(
            x=points[:, 0],
            y=points[:, 1],
            name="Class {}".format(class_number),
            mode="markers",
            marker={"size": 3},
        )
        figure.add_trace(trace)
    return figure

def plot_3d_4_views(generator, n_views=4, n_classes=3):
    """Plot the first three features of each view in a grid of 3D scatters.

    Views are placed two per row, left to right then top to bottom. The
    grid always has at least two rows (the original 2x2 layout) and grows
    by one row for every two additional views, so ``n_views`` greater
    than 4 no longer targets a subplot that does not exist.

    Parameters
    ----------
    generator : object
        Must expose ``dataset`` (indexable by view index, each entry
        supporting ``[rows, column]`` indexing with at least 3 columns),
        ``y`` (labels, compared element-wise to each class index) and
        ``sample_ids`` (indexable by sample position; used as hover text).
    n_views : int, default 4
        Number of views to plot; views ``0 .. n_views - 1`` are read.
    n_classes : int, default 3
        Number of classes; labels ``0 .. n_classes - 1`` are plotted.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure holding one 3D scatter subplot per view.
    """
    n_cols = 2
    # Never fewer than 2 rows so the default call yields the same 2x2 grid
    # as before; beyond 4 views, add as many rows as needed.
    n_rows = max(2, (n_views + n_cols - 1) // n_cols)
    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        subplot_titles=["View {}".format(view_index)
                        for view_index in range(n_views)],
        specs=[[{'type': 'scatter3d'} for _ in range(n_cols)]
               for _ in range(n_rows)])

    # Sample positions and hover labels only depend on the class, not on
    # the view, so compute them once instead of once per view.
    class_members = [np.where(generator.y == lab_index)[0]
                     for lab_index in range(n_classes)]
    hover_texts = [[generator.sample_ids[ind] for ind in members]
                   for members in class_members]
    n_colors = len(DEFAULT_PLOTLY_COLORS)

    # Plot the data for each view and each label
    for view_index in range(n_views):
        view_data = generator.dataset[view_index]
        # Subplot coordinates are 1-based.
        row, col = divmod(view_index, n_cols)
        for lab_index, members in enumerate(class_members):
            class_name = "Class {}".format(lab_index)
            fig.add_trace(
                go.Scatter3d(
                    x=view_data[members, 0],
                    y=view_data[members, 1],
                    z=view_data[members, 2],
                    text=hover_texts[lab_index],
                    hoverinfo='text',
                    legendgroup=class_name,
                    mode='markers',
                    marker=dict(
                        size=1,
                        # Wrap around the palette so more classes than
                        # colors does not raise IndexError.
                        color=DEFAULT_PLOTLY_COLORS[lab_index % n_colors],
                        opacity=0.8),
                    name=class_name,
                    # One legend entry per class: only the first view
                    # contributes to the legend.
                    showlegend=(view_index == 0)),
                row=row + 1, col=col + 1)
    return fig

StumpsGenerator : the hyper-cube vertices

The first and simplest generator we use is implemented in the StumpsGenerator class. It generates a hypercube and places clusters of samples on its vertices for each class.

Let’s visualize it in a 2D example :

[3]:

from IPython.display import display
from IPython.display import IFrame

# Extra keyword arguments unpacked into the generator constructor below.
configuration = {"class_sep":1, }
# StumpsGenerator with 3 classes, 2 features and 300 samples per class.
# `errors` holds three values — presumably one per class; confirm against
# the StumpsGenerator documentation.
sub_problem_generator = StumpsGenerator(n_classes=3,
                                        n_features=2,
                                        random_vertices=True,
                                        errors=np.array([0.3,0.3, 0.3]),
                                        random_state=np.random.RandomState(42),
                                        n_samples_per_class=np.array([300,300,300]),
                                        **configuration)
data = sub_problem_generator.gen_data()
# plot_2d draws one trace per element of `data`, using its first two columns.
fig = plot_2d(data)

# NOTE(review): `os` and `supp_dir` are not defined in the cells visible
# here — presumably they come from a cell that is not shown; confirm.
fig_path = os.path.join(supp_dir, "fig2.html")
# Save the figure as an HTML file (auto_open=False) and embed that file in
# the cell output through an IFrame.
plotly.offline.plot(fig, filename=fig_path, auto_open=False)
IFrame(src=fig_path , width=500, height=500)

[3]:

Here, we specified 2 features as it is the smallest number of dimensions that can separate 3 classes, but if we specify more, the remaining dimensions will be filled with uniform noise :

[4]:

# Same StumpsGenerator set-up as the 2-feature cell, except n_features=3.
configuration = {"class_sep":1, }
sub_problem_generator = StumpsGenerator(n_classes=3,
                                        n_features=3,
                                        random_vertices=True,
                                        errors=np.array([0.3,0.3, 0.3]),
                                        random_state=np.random.RandomState(42),
                                        n_samples_per_class=np.array([300,300,300]),
                                        **configuration)
data = sub_problem_generator.gen_data()

# plot_3d draws one trace per element of `data`, using its first three
# columns as x, y and z.
fig3 = plot_3d(data)

# NOTE(review): `os` and `supp_dir` are not defined in the cells visible
# here — presumably they come from a cell that is not shown; confirm.
fig3_path = os.path.join(supp_dir, "fig3.html")
# Save the figure as an HTML file and embed it in the cell output.
plotly.offline.plot(fig3, filename=fig3_path, auto_open=False)
IFrame(src=fig3_path , width=500, height=500)

[4]:

RingsGenerator, the concentric spheres

This generator generates n_classes concentric (possibly multi-dimensional) spheres.

[5]:

# Extra keyword arguments unpacked into the generator constructor below.
configuration = {"class_sep":0.5, }
# RingsGenerator with 3 classes, 2 features and 300 samples per class.
# NOTE(review): `random_vertices` is passed as in the StumpsGenerator cells;
# whether RingsGenerator uses it is not visible here — confirm.
sub_problem_generator = RingsGenerator(n_classes=3,
                                        n_features=2,
                                        random_vertices=True,
                                        errors=np.array([0.2,0.2, 0.2]),
                                        random_state=np.random.RandomState(42),
                                        n_samples_per_class=np.array([300,300,300]),
                                        **configuration)
data = sub_problem_generator.gen_data()
# plot_2d draws one trace per element of `data`, using its first two columns.
fig4 = plot_2d(data)

# NOTE(review): `os` and `supp_dir` are not defined in the cells visible
# here — presumably they come from a cell that is not shown; confirm.
fig4_path = os.path.join(supp_dir, "fig4.html")
# Save the figure as an HTML file and embed it in the cell output.
plotly.offline.plot(fig4, filename=fig4_path, auto_open=False)
IFrame(src=fig4_path , width=500, height=500)

[5]:

This problem requires a more complex algorithm than a decision tree to be solved, an RBF-based one for example. And it can generate sub-problems with a large number of relevant features : if we plot the same dataset but with 3 features, it builds a sphere.

[6]:

# Same RingsGenerator set-up as the 2-feature cell, except n_features=3.
configuration = {"class_sep":0.5, }
sub_problem_generator = RingsGenerator(n_classes=3,
                                        n_features=3,
                                        random_vertices=True,
                                        errors=np.array([0.2,0.2, 0.2]),
                                        random_state=np.random.RandomState(42),
                                        n_samples_per_class=np.array([300,300,300]),
                                        **configuration)
data = sub_problem_generator.gen_data()
# plot_3d draws one trace per element of `data`, using its first three
# columns as x, y and z.
fig5 = plot_3d(data)

# NOTE(review): `os` and `supp_dir` are not defined in the cells visible
# here — presumably they come from a cell that is not shown; confirm.
fig5_path = os.path.join(supp_dir, "fig5.html")
# Save the figure as an HTML file and embed it in the cell output.
plotly.offline.plot(fig5, filename=fig5_path, auto_open=False)
IFrame(src=fig5_path , width=500, height=500)

[6]:

MAGE usage : generator configuration

In order to use these generators in MAGE to their full potential, we need to set some configuration. First, let us initialize MAGE with the configuration of the previous tutorial :

[7]:

# Base parameters later passed to MultiViewGaussianSubProblemsGenerator.
name = "tuto"
n_views = 4
n_classes = 3
# 3 rows of 4 values, matching n_classes and n_views — presumably one row
# per class and one column per view; confirm against the MAGE documentation.
error_matrix = [
   [0.4, 0.4, 0.4, 0.4],
   [0.35, 0.4, 0.4, 0.4],
   [0.4, 0.4, 0.4, 0.4]
]
n_samples = 2000
n_features = 3
# NOTE(review): these weights sum to 0.999, not 1 — confirm the generator
# accepts or normalizes them.
class_weights = [0.333, 0.333, 0.333,]
# Seeded RandomState so the generated dataset is reproducible.
random_state = np.random.RandomState(42)
complementarity = 0.1
redundancy = 0.2
mutual_error = 0.01

Then, let’s configure the four monoview generators that will build our multiview dataset :

[8]:

# Generator class names, given as strings; four entries, which matches the
# n_views = 4 set earlier — presumably one entry per view, in view order.
sub_problem_type = ["StumpsGenerator",
                   "RingsGenerator",
                   "RingsGenerator",
                   "StumpsGenerator"]
# One configuration dict per entry of sub_problem_type, in the same order.
sub_problem_configuration = [
    {"class_sep":1, },
    {"class_sep":1, },
    {"class_sep":1, },
    {"class_sep":1, },

]

Now let us generate the dataset

[9]:

# Instantiate the multiview generator with the base parameters and the
# per-view sub-problem settings defined in the previous cells.
generator = MultiViewGaussianSubProblemsGenerator(name=name, n_views=n_views,
                                          n_classes=n_classes,
                                          n_samples=n_samples,
                                          n_features=n_features,
                                          class_weights=class_weights,
                                          error_matrix=error_matrix,
                                          random_state=random_state,
                                          redundancy=redundancy,
                                          complementarity=complementarity,
                                          mutual_error=mutual_error,
                                          sub_problem_configurations=sub_problem_configuration,
                                          sub_problem_generators=sub_problem_type)

# Returns the generated views and the labels; plot_3d_4_views later reads
# them back from the generator itself (generator.dataset, generator.y).
dataset, y = generator.generate_multi_view_dataset()

Let us plot the dataset views :

[10]:

# Uses the plot_3d_4_views defaults (n_views=4, n_classes=3), which match
# the configuration used to build `generator`.
fig6 = plot_3d_4_views(generator)
# NOTE(review): `os` and `supp_dir` are not defined in the cells visible
# here — presumably they come from a cell that is not shown; confirm.
fig6_path = os.path.join(supp_dir, "fig6.html")
# Save the figure as an HTML file and embed it in the cell output.
plotly.offline.plot(fig6, filename=fig6_path, auto_open=False)
IFrame(src=fig6_path , width=500, height=500)

[10]: