This notebook contains the code used to generate the sampling visualizations shown on the site. It is intended for exploration and modification rather than step-by-step instruction.

import math
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

User Configuration

Parameters in this section can be modified to explore different sampling behaviors.

# Seed reused for the population layout and for every random draw.
RANDOM_STATE = 42

# Number of samples generated per colour category.
counts = {"Red": 25, "Green": 25, "Blue": 25}

# Plotly colour string for each category name.
color_map = {
    "Grey": "rgb(165, 165, 165)",
    "Red": "rgb(255, 0, 0)",
    "Green": "rgb(0, 180, 0)",
    "Blue": "rgb(0, 0, 255)",
}

# One entry per grid column:
# (config label prefix, column title, is_subset, replace).
_SAMPLING_METHODS = (
    ("Deterministic selection", "DETERMINISTIC", True, False),
    ("Random selection no replacement", "RANDOM<br>NO REPLACEMENT", False, False),
    ("Random selection with replacement", "RANDOM<br>REPLACEMENT", False, True),
)

# One entry per grid row: (row title, fraction of the population to draw).
_SAMPLE_FRACTIONS = (("100%", 1.0), ("50%", 0.5), ("25%", 0.25))

# Every method/fraction combination, keyed "<label prefix> <percent>" and
# ordered method by method (all fractions of the first method come first).
sampling_config = {
    f"{prefix} {percent}": {
        "frac": frac,
        "is_subset": is_subset,
        "replace": replace,
    }
    for prefix, _title, is_subset, replace in _SAMPLING_METHODS
    for percent, frac in _SAMPLE_FRACTIONS
}

# Keyword arguments describing the subplot grid (one row per fraction, one
# column per method).
SUBPLOT_PARAMS = {
    "rows": len(_SAMPLE_FRACTIONS),
    "cols": len(_SAMPLING_METHODS),
    "horizontal_spacing": 0.02,
    "vertical_spacing": 0.08,
}

# Column name shown in each subplot cell: the config label lower-cased, with
# spaces turned into underscores and the percent sign removed.
plot_grid = [
    [
        f"{prefix} {percent}".lower().replace(" ", "_").replace("%", "")
        for prefix, *_ in _SAMPLING_METHODS
    ]
    for percent, _frac in _SAMPLE_FRACTIONS
]

column_titles = [title for _prefix, title, *_ in _SAMPLING_METHODS]
row_titles = [percent for percent, _frac in _SAMPLE_FRACTIONS]

max_visible = 4  # cap for opacity scaling

Functions

Data Creation and Sampling

def make_population_df(
    counts: dict,
    x_centers: dict = None,
    x_jitter: float = 0.08,
    y_jitter: float = 0.6,
    seed: int = 42
) -> pd.DataFrame:
    """
    Create a population DataFrame where each row represents a single sample.

    Columns:
    - color: category label
    - color_index: unique index within each color (0-based)
    - global_index: unique index across entire population
    - x, y: fixed spatial coordinates for visualization
    """
    rng = np.random.default_rng(seed)

    if x_centers is None:
        x_centers = {
            "Red": -.5,
            "Green": 0.0,
            "Blue": .5
        }

    rows = []

    for color, n in counts.items():
        for i in range(n):
            rows.append({
                "color": color,
                "color_index": i,
                "x": x_centers[color] + rng.normal(0, x_jitter),
                "y": rng.normal(0, y_jitter)
            })

    df = pd.DataFrame(rows)
    df["global_index"] = range(len(df))

    return df

def compute_draw_count(
    df_population: pd.DataFrame,
    frac: float,
    is_subset: bool,
    replace: bool,
    random_state: int | None = None
) -> pd.Series:
    """
    Compute per-sample draw counts for a given sampling configuration.

    Returns
    -------
    pd.Series
        Integer draw counts indexed like df_population.
    """
    n_total = int(len(df_population) * frac)

    # Initialize all counts to zero
    draw_count = pd.Series(0, index=df_population.index)

    if is_subset:
        # Deterministic: per-color proportional truncation
        cutoffs = (
            df_population.groupby("color")["color_index"]
            .max()
            .add(1)
            .mul(frac)
            .apply(np.ceil)
            .astype(int)
        )

        selected_idx = df_population.index[
            df_population["color_index"] < df_population["color"].map(cutoffs)
        ]

        draw_count.loc[selected_idx] = 1

    else:
        # Random sampling (with or without replacement)
        sampled_idx = df_population.sample(
            n=n_total,
            replace=replace,
            random_state=random_state
        ).index

        counts = sampled_idx.value_counts()
        draw_count.loc[counts.index] = counts.values

    return draw_count

def aggregate_color_rgb(df, draw_col, color_map):
    """
    Blend the category colors into one ``rgb(r,g,b)`` string, weighting each
    color by the sum of ``df[draw_col]`` over the rows of that color.

    The "Grey" entry of ``color_map`` is skipped. Channel values are
    truncated to integers. If all weights sum to zero, the light grey
    ``rgb(230,230,230)`` is returned instead.
    """
    draw_weights = df[draw_col].values
    total_weight = draw_weights.sum()

    if total_weight == 0:
        return "rgb(230,230,230)"  # nothing drawn: neutral fallback

    blended = np.zeros(3)

    for name, css_rgb in color_map.items():
        if name != "Grey":
            # "rgb(255, 0, 0)" -> [255, 0, 0]
            channels = np.array([int(part) for part in css_rgb[4:-1].split(",")])
            belongs = (df["color"] == name).to_numpy()
            blended += channels * draw_weights[belongs].sum()

    red, green, blue = (blended / total_weight).astype(int)
    return f"rgb({red},{green},{blue})"

Plotly Visuals

def create_subplots_fig(params):
    """
    Build an empty subplot figure.

    ``params`` is a dict of keyword arguments forwarded unchanged to
    ``make_subplots``.
    """
    figure = make_subplots(**params)
    return figure

def add_sample_selection_grid(
    fig,
    df,
    plot_grid,
    color_map,
    max_draw_cap=4,
    grey_opacity=0.4,
    base_marker_size=8,
):
    """
    Add scatter traces showing which samples were selected
    under each sampling configuration.

    Parameters
    ----------
    fig
        Subplot figure to add traces to (modified in place).
    df
        Population frame with ``x``, ``y``, ``color`` and one draw-count
        column per name listed in ``plot_grid``.
    plot_grid
        Nested list (rows of columns) naming the draw-count column shown
        in each subplot cell.
    color_map
        Mapping of color name -> Plotly color string. Its "Grey" entry is
        used for samples that were not drawn. Every other key is a category
        that can be plotted; samples whose color has no entry are not drawn.
    max_draw_cap
        Draw count at which a selected marker reaches full opacity.
    grey_opacity
        Opacity of samples that were not drawn.
    base_marker_size
        Marker size for all samples.

    Raises
    ------
    ValueError
        If ``max_draw_cap`` is not positive (it is used as a divisor).
    """
    if max_draw_cap <= 0:
        raise ValueError("max_draw_cap must be positive")

    # Grey goes first so selected samples are drawn on top of it. The rest
    # follow color_map order, so categories beyond Red/Green/Blue are
    # plotted too instead of being silently dropped.
    draw_order = ["Grey"] + [name for name in color_map if name != "Grey"]

    for r, row_cols in enumerate(plot_grid, start=1):
        for c, draw_col in enumerate(row_cols, start=1):

            draw = df[draw_col]

            # Samples never drawn are shown as "Grey" whatever their color.
            plot_color = np.where(draw > 0, df["color"], "Grey")

            # Selected samples ramp from 0.4 towards 1.0 as their draw count
            # approaches max_draw_cap; counts above the cap stay at 1.0.
            opacity = np.where(
                draw > 0,
                0.4 + 0.6 * np.minimum(draw, max_draw_cap) / max_draw_cap,
                grey_opacity,
            )

            for color_name in draw_order:
                mask = plot_color == color_name
                if not mask.any():
                    continue

                fig.add_trace(
                    go.Scatter(
                        x=df.loc[mask, "x"],
                        y=df.loc[mask, "y"],
                        mode="markers",
                        marker=dict(
                            size=base_marker_size,
                            color=color_map[color_name],
                            opacity=opacity[mask],
                        ),
                        showlegend=False,
                    ),
                    row=r,
                    col=c,
                )

def add_aggregate_color_grid(
    fig,
    df,
    plot_grid,
    color_map,
    marker_size=140,
    outline_color="rgba(0,0,0,0.45)",
    outline_width=1.5,
):
    """
    Put a single large marker at (0, 0) in every subplot, filled with the
    draw-count-weighted blend of colors for that cell's configuration.

    ``plot_grid`` names the draw-count column for each (row, column) cell.
    The blend comes from ``aggregate_color_rgb``. The figure is modified in
    place.
    """
    for row_no, column_names in enumerate(plot_grid, start=1):
        for col_no, draw_column in enumerate(column_names, start=1):
            marker_style = dict(
                size=marker_size,
                color=aggregate_color_rgb(df, draw_column, color_map),
                line=dict(color=outline_color, width=outline_width),
            )
            blob = go.Scatter(
                x=[0],
                y=[0],
                mode="markers",
                marker=marker_style,
                showlegend=False,
            )
            fig.add_trace(blob, row=row_no, col=col_no)

def hide_subplot_axes(fig, n_rows, n_cols, axis_range=(-1, 1)):
    """
    Make the x and y axes of every subplot invisible and give them all the
    same ``axis_range``.

    Cells are visited row by row; for each cell the x axis is updated
    before the y axis.
    """
    cells = [
        (row_no, col_no)
        for row_no in range(1, n_rows + 1)
        for col_no in range(1, n_cols + 1)
    ]

    for row_no, col_no in cells:
        for update_axis in (fig.update_xaxes, fig.update_yaxes):
            update_axis(
                visible=False,
                range=list(axis_range),
                row=row_no,
                col=col_no,
            )

def add_column_labels(
    fig,
    column_titles,
    n_cols,
    y=1.05,
    font_size=14,
):
    """
    Annotate the figure with one bold title per column.

    Title ``i`` is horizontally centred on ``(i + 0.5) / n_cols`` in paper
    coordinates and placed at paper height ``y``.
    """
    # Settings that are identical for every column title.
    shared = dict(
        xref="paper",
        yref="paper",
        y=y,
        showarrow=False,
        xanchor='center',
    )

    for slot, label in enumerate(column_titles):
        fig.add_annotation(
            text=f"<b>{label}</b>",
            x=(slot + 0.5) / n_cols,
            font=dict(size=font_size),
            **shared,
        )

def add_row_labels(
    fig,
    row_titles,
    n_rows,
    x=-0.05,
    font_size=16,
):
    """
    Annotate the figure with one bold label per row.

    Label ``i`` is vertically centred on ``1 - (i + 0.5) / n_rows`` in paper
    coordinates (so the first label is at the top) and placed at paper
    position ``x``.
    """
    # Settings that are identical for every row label.
    shared = dict(
        xref="paper",
        yref="paper",
        x=x,
        showarrow=False,
        yanchor='middle',
    )

    for slot, label in enumerate(row_titles):
        fig.add_annotation(
            text=f"<b>{label}</b>",
            y=1 - (slot + 0.5) / n_rows,
            font=dict(size=font_size),
            **shared,
        )

def add_title(fig, title=None):
    """
    Set a bold, horizontally centred figure title.

    Does nothing when ``title`` is None.
    """
    if title is None:
        return

    title_spec = dict(
        text=f"<b>{title}</b>",
        x=0.5,
        xanchor="center",
        font=dict(size=18),
        y=.98,
    )
    fig.update_layout(title=title_spec)

def apply_shared_layout(
    fig,
    height=900,
    width=900,
    margins=None,
    background_color="white",
):
    """
    Apply shared layout settings for grid-based figures.

    Parameters
    ----------
    fig
        Figure to update in place via ``fig.update_layout``.
    height, width
        Figure size passed through as the layout ``height`` / ``width``.
    margins
        Margin mapping passed as the layout ``margin``. When omitted (or
        None), ``l=60, r=20, t=80, b=40`` is used.
    background_color
        Value passed as the layout ``plot_bgcolor``.
    """
    if margins is None:
        # Built on every call: a dict in the signature would be one shared
        # object, so a change made to it anywhere would leak into later calls.
        margins = dict(l=60, r=20, t=80, b=40)

    fig.update_layout(
        height=height,
        width=width,
        margin=margins,
        plot_bgcolor=background_color,
    )

def apply_grid_formatting(
    fig,
    n_rows,
    n_cols,
    row_titles,
    column_titles,
    title=None,
):
    """
    Run every shared formatting step on a grid-based Plotly figure.

    In order: hide the subplot axes, add the column and row labels, set the
    optional title, then apply the shared layout with its default settings.
    The figure is modified in place.
    """
    hide_subplot_axes(fig, n_rows=n_rows, n_cols=n_cols)

    # Annotations: column headers first, then row labels, then the title.
    add_column_labels(fig, column_titles=column_titles, n_cols=n_cols)
    add_row_labels(fig, row_titles=row_titles, n_rows=n_rows)
    add_title(fig, title=title)

    apply_shared_layout(fig)

Population & Sampling

# Build the population once; RANDOM_STATE seeds the jittered coordinates.
df = make_population_df(counts=counts, seed=RANDOM_STATE)

# Preview the first five rows.
df.head(n=5)

	color	color_index	x	y	global_index
0	Red	0	-0.475623	-0.623990	0
1	Red	1	-0.439964	0.564339	1
2	Red	2	-0.656083	-0.781308	2
3	Red	3	-0.489773	-0.189746	3
4	Red	4	-0.501344	-0.511826	4

# Copy taken before any draw-count column is added, so every configuration
# is computed from the same frame.
df_population = df.copy()

for label, cfg in sampling_config.items():
    # "Random selection with replacement 50%"
    #   -> "random_selection_with_replacement_50"
    colname = label.lower().translate(str.maketrans({" ": "_", "%": None}))

    # The same seed is used for every configuration.
    df[colname] = compute_draw_count(
        df_population,
        cfg["frac"],
        cfg["is_subset"],
        cfg["replace"],
        RANDOM_STATE,
    )

Visualization

Sample Scatterplots

# One scatter panel per cell of plot_grid.
fig1 = create_subplots_fig(SUBPLOT_PARAMS)

# max_visible is the configured cap for opacity scaling; it has to be passed
# explicitly, otherwise the function's own default is used and changing the
# setting has no effect.
add_sample_selection_grid(
    fig1,
    df,
    plot_grid,
    color_map,
    max_draw_cap=max_visible,
)

# Grid dimensions are read from SUBPLOT_PARAMS so the formatting always
# matches the grid the figure was created with.
apply_grid_formatting(
    fig1,
    n_rows=SUBPLOT_PARAMS["rows"],
    n_cols=SUBPLOT_PARAMS["cols"],
    row_titles=row_titles,
    column_titles=column_titles,
    title="Sample Selection by Method and Size",
)

Color Aggregation

# One aggregate-color marker per cell of plot_grid.
fig2 = create_subplots_fig(SUBPLOT_PARAMS)
add_aggregate_color_grid(fig2, df, plot_grid, color_map)

# Grid dimensions are read from SUBPLOT_PARAMS so the formatting always
# matches the grid the figure was created with.
apply_grid_formatting(
    fig2,
    n_rows=SUBPLOT_PARAMS["rows"],
    n_cols=SUBPLOT_PARAMS["cols"],
    row_titles=row_titles,
    column_titles=column_titles,
    title="Aggregated Outcome",
)

Final Plots

# Render the scatter grid first, then the aggregated-color grid.
for figure in (fig1, fig2):
    figure.show()