spatial_graph_algorithms.compare

Helpers for comparative simulation and reconstruction studies. run_comparison() returns a ComparisonResult with built-in summary, ranking, plotting, and I/O.

API Reference

`spatial_graph_algorithms.compare.ComparisonResult` `dataclass`

Results of a multi-method reconstruction comparison study.

Wraps the raw tidy DataFrame produced by `run_comparison()` and exposes convenience methods for summarising, ranking, and plotting without boilerplate pandas.

The raw DataFrame is always accessible via the `df` attribute.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	One row per `graph_spec × seed × reconstruction_spec`.	required

Examples:

>>> from spatial_graph_algorithms.compare import parameter_grid, run_comparison
>>> graphs = parameter_grid(cases=[{"n": 50, "mode": "knn", "k": 4}])
>>> recons = parameter_grid(cases=[{"method": "mds"}])
>>> result = run_comparison(graph_specs=graphs, reconstruction_specs=recons, seeds=[1])
>>> isinstance(result.df, pd.DataFrame)
True

Source code in src/spatial_graph_algorithms/compare/__init__.py

@dataclass
class ComparisonResult:
    """Results of a multi-method reconstruction comparison study.

    Wraps the raw tidy DataFrame produced by :func:`run_comparison` and
    exposes convenience methods for summarising, ranking, and plotting
    without boilerplate pandas.

    The raw DataFrame is always accessible via :attr:`df`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per ``graph_spec × seed × reconstruction_spec``.

    Examples
    --------
    >>> from spatial_graph_algorithms.compare import parameter_grid, run_comparison
    >>> graphs = parameter_grid(cases=[{"n": 50, "mode": "knn", "k": 4}])
    >>> recons = parameter_grid(cases=[{"method": "mds"}])
    >>> result = run_comparison(graph_specs=graphs, reconstruction_specs=recons, seeds=[1])
    >>> isinstance(result.df, pd.DataFrame)
    True
    """

    df: pd.DataFrame

    # ------------------------------------------------------------------
    # Analysis helpers
    # ------------------------------------------------------------------

    def summary(
        self,
        *,
        by: list[str] | None = None,
        metrics: list[str] | None = None,
    ) -> pd.DataFrame:
        """Return mean metrics grouped by method and graph condition.

        Only rows with ``status == "ok"`` are included.

        Parameters
        ----------
        by : list of str, optional
            Columns to group by.  Default is ``["graph_label", "method"]``.
        metrics : list of str, optional
            Metric columns to aggregate.  Default is ``["cpd", "knn"]``.
            Requested metrics that are not columns of the results are
            silently skipped.

        Returns
        -------
        pandas.DataFrame
            Mean of each metric for each group, rounded to 4 decimals.
            Groups with no successful rows are absent.  An empty DataFrame
            is returned when none of the requested metrics exist.

        Examples
        --------
        >>> result.summary()  # doctest: +SKIP
                                         cpd    knn
        graph_label  method
        mode=knn__k=4  landmark_mds  0.8821  0.7341
                       mds           0.7512  0.6103
        """
        by = list(by) if by is not None else _DEFAULT_SUMMARY_BY
        metrics = list(metrics) if metrics is not None else _DEFAULT_METRICS
        ok = self.df[self.df["status"] == "ok"]
        # Only aggregate metrics that actually exist in the results.
        present = [m for m in metrics if m in ok.columns]
        if not present:
            return pd.DataFrame()
        return ok.groupby(by)[present].mean().round(4)

    def best(
        self,
        *,
        metric: str = "cpd",
        by: list[str] | None = None,
        higher_is_better: bool = True,
    ) -> pd.DataFrame:
        """Return the best-performing method per group.

        Only rows with ``status == "ok"`` are considered.

        Parameters
        ----------
        metric : str
            Metric column to rank by.  Default is ``"cpd"``.
        by : list of str, optional
            Grouping columns.  Default is ``["graph_label"]``.
        higher_is_better : bool
            If ``True`` (default), select the row with the highest metric
            value.  Set to ``False`` for error / loss metrics.

        Returns
        -------
        pandas.DataFrame
            One row per unique ``by`` group, showing the best method and its
            mean metric value (averaged over seeds).  Groups whose metric is
            NaN for every method are omitted.

        Raises
        ------
        ValueError
            If ``metric`` is not a column in the results DataFrame.

        Examples
        --------
        >>> result.best(metric="cpd")  # doctest: +SKIP
              graph_label        method     cpd
        0  mode=knn__k=4  landmark_mds  0.8821
        """
        by_cols = list(by) if by is not None else list(_DEFAULT_BEST_BY)
        ok = self.df[self.df["status"] == "ok"]
        if metric not in ok.columns:
            available = [c for c in ok.columns if ok[c].dtype.kind == "f"]
            raise ValueError(
                f"Metric {metric!r} not found in results.  "
                f"Available numeric columns: {available}"
            )
        # Avoid a duplicated grouping key when the caller already groups by method.
        grp_cols = by_cols if "method" in by_cols else [*by_cols, "method"]
        agg = ok.groupby(grp_cols)[metric].mean().reset_index()
        # A NaN mean cannot be ranked; leaving it in makes idxmax/idxmin
        # yield NaN (or raise) for all-NaN groups and breaks the lookup below.
        agg = agg.dropna(subset=[metric])
        if agg.empty:
            return agg.reset_index(drop=True)
        ranked = agg.groupby(by_cols)[metric]
        idx = ranked.idxmax() if higher_is_better else ranked.idxmin()
        return agg.loc[idx.values].reset_index(drop=True)

    # ------------------------------------------------------------------
    # Plotting
    # ------------------------------------------------------------------

    def plot(
        self,
        *,
        metric: str = "cpd",
        by: str = "method",
        hue: str | None = "graph_label",
        ax: plt.Axes | None = None,
    ) -> plt.Figure:
        """Bar chart of a quality metric grouped by method and condition.

        Means are computed over all successful rows (``status == "ok"``).
        Error bars show one standard deviation.

        Parameters
        ----------
        metric : str
            Metric column to plot.  Default is ``"cpd"``.
        by : str
            Column that defines the x-axis categories.  Default is
            ``"method"``.
        hue : str, optional
            Column that defines the colour grouping.  Default is
            ``"graph_label"``.  Pass ``None`` for a single-colour chart.
            A hue that is not a column of the results, or that equals
            ``by``, is ignored.
        ax : matplotlib.axes.Axes, optional
            Axes to draw on.  A new figure is created when omitted.

        Returns
        -------
        matplotlib.figure.Figure

        Raises
        ------
        ValueError
            If ``metric`` is not a column in the results DataFrame.

        Examples
        --------
        >>> fig = result.plot(metric="cpd", by="method")  # doctest: +SKIP
        """
        ok = self.df[self.df["status"] == "ok"]
        if metric not in ok.columns:
            raise ValueError(f"Metric {metric!r} not found in results.")

        # Imported lazily, and only after the cheap validation above, so an
        # invalid call fails fast without loading matplotlib.
        import matplotlib.pyplot as plt

        # Grouping twice by the same column (hue == by) is meaningless and
        # breaks reset_index, so fall back to a single-colour chart.
        use_hue = hue is not None and hue != by and hue in ok.columns
        effective_hue = hue if use_hue else None
        group_cols = [effective_hue, by] if effective_hue is not None else [by]

        agg = ok.groupby(group_cols)[metric].agg(["mean", "std"]).reset_index()

        if ax is None:
            fig, ax = plt.subplots(figsize=(max(5, len(agg) * 0.9), 4))
        else:
            fig = ax.get_figure()

        by_vals = sorted(agg[by].unique())
        # Category -> integer x position; shared by every hue series.
        x_map = {v: j for j, v in enumerate(by_vals)}
        colors = plt.cm.tab10.colors  # type: ignore[attr-defined]

        if effective_hue is not None:
            hue_vals = sorted(agg[effective_hue].unique())
            n_hue = len(hue_vals)
            # With no successful rows there are no hue values; guard the
            # division so an empty chart is drawn instead of crashing.
            width = 0.75 / n_hue if n_hue else 0.75
            for i, hue_val in enumerate(hue_vals):
                sub = agg[agg[effective_hue] == hue_val]
                # Centre the cluster of hue bars on each integer x position.
                x_pos = [x_map[v] + (i - n_hue / 2 + 0.5) * width for v in sub[by]]
                ax.bar(
                    x_pos,
                    sub["mean"],
                    width=width,
                    label=str(hue_val),
                    # std is NaN for single-row groups; draw no error bar.
                    yerr=sub["std"].fillna(0),
                    capsize=3,
                    color=colors[i % len(colors)],
                    alpha=0.85,
                )
            if hue_vals:
                ax.legend(title=effective_hue, bbox_to_anchor=(1.01, 1), loc="upper left")
        else:
            # agg has exactly one row per ``by`` value here; align it with
            # the sorted tick order.
            stats = agg.set_index(by).reindex(by_vals)
            ax.bar(
                range(len(by_vals)),
                stats["mean"],
                yerr=stats["std"].fillna(0),
                capsize=3,
                color=colors[0],
                alpha=0.85,
            )

        ax.set_xticks(range(len(by_vals)))
        ax.set_xticklabels(by_vals, rotation=15, ha="right")
        ax.set_xlabel(by)
        ax.set_ylabel(metric)
        title = f"{metric} by {by}"
        if effective_hue:
            title += f", coloured by {effective_hue}"
        ax.set_title(title)
        fig.tight_layout()
        return fig

    # ------------------------------------------------------------------
    # I/O
    # ------------------------------------------------------------------

    def save(self, path: str | Path) -> None:
        """Save results to a CSV or Parquet file.

        The format is inferred from the file extension (``.parquet`` → Parquet,
        anything else → CSV).  The DataFrame index is not written.

        Parameters
        ----------
        path : str or Path
            Destination path.  Parent directories are created automatically.

        Examples
        --------
        >>> result.save("results/comparison.csv")  # doctest: +SKIP
        """
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        if p.suffix == ".parquet":
            self.df.to_parquet(p, index=False)
        else:
            self.df.to_csv(p, index=False)

    @classmethod
    def load(cls, path: str | Path) -> ComparisonResult:
        """Load results previously saved with :meth:`save`.

        The format is inferred from the file extension in the same way as
        :meth:`save` (``.parquet`` → Parquet, anything else → CSV).

        Parameters
        ----------
        path : str or Path
            Path to a CSV or Parquet file created by :meth:`save`.

        Returns
        -------
        ComparisonResult

        Examples
        --------
        >>> result = ComparisonResult.load("results/comparison.csv")  # doctest: +SKIP
        """
        p = Path(path)
        if p.suffix == ".parquet":
            return cls(df=pd.read_parquet(p))
        return cls(df=pd.read_csv(p))

    # ------------------------------------------------------------------
    # Display
    # ------------------------------------------------------------------

    def __repr__(self) -> str:
        """Return a one-line summary: row counts and the default metrics present."""
        n_total = len(self.df)
        # Without a status column every row is counted as successful.
        n_ok = int((self.df["status"] == "ok").sum()) if "status" in self.df.columns else n_total
        n_err = n_total - n_ok
        metrics = [m for m in _DEFAULT_METRICS if m in self.df.columns]
        return (
            f"ComparisonResult(rows={n_total}, ok={n_ok}, errors={n_err}, "
            f"metrics={metrics})"
        )

    def _repr_html_(self) -> str:
        """Return the HTML representation of the underlying DataFrame."""
        return self.df._repr_html_()  # type: ignore[return-value]

Functions

`summary(*, by=None, metrics=None)`

Return mean metrics grouped by method and graph condition.

Only rows with status == "ok" are included.

Parameters:

Name	Type	Description	Default
`by`	`list of str`	Columns to group by. Default is `["graph_label", "method"]`.	`None`
`metrics`	`list of str`	Metric columns to aggregate. Default is `["cpd", "knn"]`.	`None`

Returns:

Type	Description
`DataFrame`	Mean of each metric for each group. Groups with no successful rows are absent.

Examples:

>>> result.summary()
                                 cpd    knn
graph_label  method
mode=knn__k=4  landmark_mds  0.8821  0.7341
               mds           0.7512  0.6103

Source code in src/spatial_graph_algorithms/compare/__init__.py

def summary(
    self,
    *,
    by: list[str] | None = None,
    metrics: list[str] | None = None,
) -> pd.DataFrame:
    """Average the requested metrics over successful runs, per group.

    Rows whose ``status`` is not ``"ok"`` are discarded before grouping.

    Parameters
    ----------
    by : list of str, optional
        Grouping columns.  Falls back to the module default
        (documented as ``["graph_label", "method"]``) when omitted.
    metrics : list of str, optional
        Metric columns to average.  Falls back to the module default
        (documented as ``["cpd", "knn"]``) when omitted.  Names that are
        not columns of the results are skipped.

    Returns
    -------
    pandas.DataFrame
        Per-group means rounded to 4 decimals.  Groups without any
        successful row do not appear.  An empty DataFrame is returned
        when none of the requested metrics exist.

    Examples
    --------
    >>> result.summary()  # doctest: +SKIP
                                     cpd    knn
    graph_label  method
    mode=knn__k=4  landmark_mds  0.8821  0.7341
                   mds           0.7512  0.6103
    """
    group_keys = _DEFAULT_SUMMARY_BY if by is None else list(by)
    wanted = _DEFAULT_METRICS if metrics is None else list(metrics)

    succeeded = self.df.loc[self.df["status"] == "ok"]
    usable = [name for name in wanted if name in succeeded.columns]

    if usable:
        per_group = succeeded.groupby(group_keys)[usable].mean()
        return per_group.round(4)
    # Nothing to aggregate: none of the requested metrics are present.
    return pd.DataFrame()

`best(*, metric='cpd', by=None, higher_is_better=True)`

Return the best-performing method per group.

Parameters:

Name	Type	Description	Default
`metric`	`str`	Metric column to rank by. Default is `"cpd"`.	`'cpd'`
`by`	`list of str`	Grouping columns. Default is `["graph_label"]`.	`None`
`higher_is_better`	`bool`	If `True` (default), select the row with the highest metric value. Set to `False` for error / loss metrics.	`True`

Returns:

Type	Description
`DataFrame`	One row per unique `by` group, showing the best method and its mean metric value (averaged over seeds).

Raises:

Type	Description
`ValueError`	If `metric` is not a column in the results DataFrame.

Examples:

>>> result.best(metric="cpd")
      graph_label        method     cpd
0  mode=knn__k=4  landmark_mds  0.8821

Source code in src/spatial_graph_algorithms/compare/__init__.py

def best(
    self,
    *,
    metric: str = "cpd",
    by: list[str] | None = None,
    higher_is_better: bool = True,
) -> pd.DataFrame:
    """Return the best-performing method per group.

    Only rows with ``status == "ok"`` are considered.

    Parameters
    ----------
    metric : str
        Metric column to rank by.  Default is ``"cpd"``.
    by : list of str, optional
        Grouping columns.  Default is ``["graph_label"]``.
    higher_is_better : bool
        If ``True`` (default), select the row with the highest metric
        value.  Set to ``False`` for error / loss metrics.

    Returns
    -------
    pandas.DataFrame
        One row per unique ``by`` group, showing the best method and its
        mean metric value (averaged over seeds).  Groups whose metric is
        NaN for every method are omitted.

    Raises
    ------
    ValueError
        If ``metric`` is not a column in the results DataFrame.

    Examples
    --------
    >>> result.best(metric="cpd")  # doctest: +SKIP
          graph_label        method     cpd
    0  mode=knn__k=4  landmark_mds  0.8821
    """
    by_cols = list(by) if by is not None else list(_DEFAULT_BEST_BY)
    ok = self.df[self.df["status"] == "ok"]
    if metric not in ok.columns:
        available = [c for c in ok.columns if ok[c].dtype.kind == "f"]
        raise ValueError(
            f"Metric {metric!r} not found in results.  "
            f"Available numeric columns: {available}"
        )
    # Avoid a duplicated grouping key when the caller already groups by method.
    grp_cols = by_cols if "method" in by_cols else [*by_cols, "method"]
    agg = ok.groupby(grp_cols)[metric].mean().reset_index()
    # A NaN mean cannot be ranked; leaving it in makes idxmax/idxmin yield
    # NaN (or raise) for all-NaN groups and breaks the lookup below.
    agg = agg.dropna(subset=[metric])
    if agg.empty:
        return agg.reset_index(drop=True)
    ranked = agg.groupby(by_cols)[metric]
    idx = ranked.idxmax() if higher_is_better else ranked.idxmin()
    return agg.loc[idx.values].reset_index(drop=True)

`plot(*, metric='cpd', by='method', hue='graph_label', ax=None)`

Bar chart of a quality metric grouped by method and condition.

Means are computed over all successful rows (status == "ok"). Error bars show one standard deviation.

Parameters:

Name	Type	Description	Default
`metric`	`str`	Metric column to plot. Default is `"cpd"`.	`'cpd'`
`by`	`str`	Column that defines the x-axis categories. Default is `"method"`.	`'method'`
`hue`	`str`	Column that defines the colour grouping. Default is `"graph_label"`. Pass `None` for a single-colour chart.	`'graph_label'`
`ax`	`Axes`	Axes to draw on. A new figure is created when omitted.	`None`

Returns:

Type	Description
`Figure`

Raises:

Type	Description
`ValueError`	If `metric` is not a column in the results DataFrame.

Examples:

>>> fig = result.plot(metric="cpd", by="method")

Source code in src/spatial_graph_algorithms/compare/__init__.py

def plot(
    self,
    *,
    metric: str = "cpd",
    by: str = "method",
    hue: str | None = "graph_label",
    ax: plt.Axes | None = None,
) -> plt.Figure:
    """Bar chart of a quality metric grouped by method and condition.

    Means are computed over all successful rows (``status == "ok"``).
    Error bars show one standard deviation.

    Parameters
    ----------
    metric : str
        Metric column to plot.  Default is ``"cpd"``.
    by : str
        Column that defines the x-axis categories.  Default is
        ``"method"``.
    hue : str, optional
        Column that defines the colour grouping.  Default is
        ``"graph_label"``.  Pass ``None`` for a single-colour chart.
        A hue that is not a column of the results, or that equals ``by``,
        is ignored.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on.  A new figure is created when omitted.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If ``metric`` is not a column in the results DataFrame.

    Examples
    --------
    >>> fig = result.plot(metric="cpd", by="method")  # doctest: +SKIP
    """
    ok = self.df[self.df["status"] == "ok"]
    if metric not in ok.columns:
        raise ValueError(f"Metric {metric!r} not found in results.")

    # Imported lazily, and only after the cheap validation above, so an
    # invalid call fails fast without loading matplotlib.
    import matplotlib.pyplot as plt

    # Grouping twice by the same column (hue == by) is meaningless and
    # breaks reset_index, so fall back to a single-colour chart.
    use_hue = hue is not None and hue != by and hue in ok.columns
    effective_hue = hue if use_hue else None
    group_cols = [effective_hue, by] if effective_hue is not None else [by]

    agg = ok.groupby(group_cols)[metric].agg(["mean", "std"]).reset_index()

    if ax is None:
        fig, ax = plt.subplots(figsize=(max(5, len(agg) * 0.9), 4))
    else:
        fig = ax.get_figure()

    by_vals = sorted(agg[by].unique())
    # Category -> integer x position; shared by every hue series.
    x_map = {v: j for j, v in enumerate(by_vals)}
    colors = plt.cm.tab10.colors  # type: ignore[attr-defined]

    if effective_hue is not None:
        hue_vals = sorted(agg[effective_hue].unique())
        n_hue = len(hue_vals)
        # With no successful rows there are no hue values; guard the
        # division so an empty chart is drawn instead of crashing.
        width = 0.75 / n_hue if n_hue else 0.75
        for i, hue_val in enumerate(hue_vals):
            sub = agg[agg[effective_hue] == hue_val]
            # Centre the cluster of hue bars on each integer x position.
            x_pos = [x_map[v] + (i - n_hue / 2 + 0.5) * width for v in sub[by]]
            ax.bar(
                x_pos,
                sub["mean"],
                width=width,
                label=str(hue_val),
                # std is NaN for single-row groups; draw no error bar.
                yerr=sub["std"].fillna(0),
                capsize=3,
                color=colors[i % len(colors)],
                alpha=0.85,
            )
        if hue_vals:
            ax.legend(title=effective_hue, bbox_to_anchor=(1.01, 1), loc="upper left")
    else:
        # agg has exactly one row per ``by`` value here; align it with the
        # sorted tick order.
        stats = agg.set_index(by).reindex(by_vals)
        ax.bar(
            range(len(by_vals)),
            stats["mean"],
            yerr=stats["std"].fillna(0),
            capsize=3,
            color=colors[0],
            alpha=0.85,
        )

    ax.set_xticks(range(len(by_vals)))
    ax.set_xticklabels(by_vals, rotation=15, ha="right")
    ax.set_xlabel(by)
    ax.set_ylabel(metric)
    title = f"{metric} by {by}"
    if effective_hue:
        title += f", coloured by {effective_hue}"
    ax.set_title(title)
    fig.tight_layout()
    return fig

`save(path)`

Save results to a CSV or Parquet file.

The format is inferred from the file extension (.parquet → Parquet, anything else → CSV).

Parameters:

Name	Type	Description	Default
`path`	`str or Path`	Destination path. Parent directories are created automatically.	required

Examples:

>>> result.save("results/comparison.csv")

Source code in src/spatial_graph_algorithms/compare/__init__.py

def save(self, path: str | Path) -> None:
    """Write the results DataFrame to disk as Parquet or CSV.

    A path ending in ``.parquet`` is written as Parquet; every other
    extension is written as CSV.  The DataFrame index is not written.

    Parameters
    ----------
    path : str or Path
        Destination file.  Missing parent directories are created.

    Examples
    --------
    >>> result.save("results/comparison.csv")  # doctest: +SKIP
    """
    target = Path(path)
    # Make sure the destination folder exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)
    writer = self.df.to_parquet if target.suffix == ".parquet" else self.df.to_csv
    writer(target, index=False)

`load(path)` `classmethod`

Load results previously saved with `save()`.

Parameters:

Name	Type	Description	Default
`path`	`str or Path`	Path to a CSV or Parquet file created by `save()`.	required

Returns:

Type	Description
`ComparisonResult`

Examples:

>>> result = ComparisonResult.load("results/comparison.csv")

Source code in src/spatial_graph_algorithms/compare/__init__.py

@classmethod
def load(cls, path: str | Path) -> ComparisonResult:
    """Rebuild a result object from a file written by :meth:`save`.

    A path ending in ``.parquet`` is read as Parquet; every other
    extension is read as CSV.

    Parameters
    ----------
    path : str or Path
        Location of a CSV or Parquet file created by :meth:`save`.

    Returns
    -------
    ComparisonResult
        A new instance wrapping the loaded DataFrame.

    Examples
    --------
    >>> result = ComparisonResult.load("results/comparison.csv")  # doctest: +SKIP
    """
    source = Path(path)
    reader = pd.read_parquet if source.suffix == ".parquet" else pd.read_csv
    return cls(df=reader(source))