Skip to content

climate_ref_esmvaltool.recipe #

as_facets(group) #

Convert a group from the datasets dataframe to ESMValTool facets.

Parameters:

Name Type Description Default
group DataFrame

A group of datasets representing a single instance_id.

required

Returns:

Type Description
A dict containing facet-value pairs.
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def as_facets(
    group: pd.DataFrame,
) -> dict[str, Any]:
    """Translate one group of dataset rows into ESMValTool facets.

    Parameters
    ----------
    group:
        Rows of the datasets dataframe that together describe a single
        instance_id.

    Returns
    -------
        A dict mapping facet names to values. A facet holds a single value
        when all rows agree and a list of the distinct values otherwise.
        A ``timerange`` facet is added only when one can be determined.

    """
    # The project is the first dot-separated component of the instance_id.
    project = group.iloc[0].instance_id.split(".")[0]
    facets: dict[str, Any] = {"project": project}

    for facet_name, column in FACETS[project].items():
        if facet_name == "activity":
            # Take the activity from the instance_id so that it matches the
            # directory structure created by prepare_climate_data(). The
            # activity_id column can contain space-separated values
            # (e.g. "C4MIP CDRMIP") but the instance_id always uses only
            # the primary activity.
            source = group["instance_id"].apply(lambda instance_id: instance_id.split(".")[1])
        else:
            source = group[column]
        distinct = source.unique().tolist()
        # Collapse to a scalar unless there really are several values.
        facets[facet_name] = distinct[0] if len(distinct) <= 1 else distinct

    timerange = as_timerange(group)
    if timerange is not None:
        facets["timerange"] = timerange
    return facets

as_isodate(timestamp) #

Format a timestamp as an ISO 8601 datetime.

For example, '2014-12-16 12:00:00' will be formatted as '20141216T120000'.

Parameters:

Name Type Description Default
timestamp Timestamp

The timestamp to format.

required
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def as_isodate(timestamp: pd.Timestamp) -> str:
    """Render a timestamp in the compact ISO 8601 form without separators.

    For example, '2014-12-16 12:00:00' becomes '20141216T120000'.

    Parameters
    ----------
    timestamp
        The timestamp to format.

    """
    # One pass over the text: the space between date and time becomes "T",
    # while the "-" and ":" separators are dropped.
    separators = str.maketrans(" ", "T", "-:")
    return str(timestamp).translate(separators)

as_timerange(group) #

Format the timeranges from a dataframe as an ESMValTool timerange.

Parameters:

Name Type Description Default
group DataFrame

The dataframe describing a single dataset.

required

Returns:

Type Description
A timerange.
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def as_timerange(group: pd.DataFrame) -> str | None:
    """Format the timeranges from a dataframe as an ESMValTool timerange.

    Parameters
    ----------
    group
        The dataframe describing a single dataset.

    Returns
    -------
        A timerange.
    """
    # TODO: apply some rounding to avoid problems?
    # https://github.com/ESMValGroup/ESMValCore/issues/2048
    start_times = group.start_time.dropna()
    if start_times.empty:
        return None
    end_times = group.end_time.dropna()
    if end_times.empty:
        return None  # pragma: no cover
    return f"{as_isodate(start_times.min())}/{as_isodate(end_times.max())}"

dataframe_to_recipe(files, group_by=('instance_id',), equalize_timerange=False) #

Convert the datasets dataframe to a recipe "variables" section.

Parameters:

Name Type Description Default
files DataFrame

The pandas dataframe describing the input files.

required
group_by tuple[str, ...]

The columns to group the input files by.

('instance_id',)
equalize_timerange bool

If True, use the timerange that is covered by all datasets.

False

Returns:

Type Description
A "variables" section that can be used in an ESMValTool recipe.
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def dataframe_to_recipe(
    files: pd.DataFrame,
    group_by: tuple[str, ...] = ("instance_id",),
    equalize_timerange: bool = False,
) -> dict[str, Any]:
    """Convert the datasets dataframe to a recipe "variables" section.

    Parameters
    ----------
    files
        The pandas dataframe describing the input files.
    group_by
        The columns to group the input files by.
    equalize_timerange
        If True, use the timerange that is covered by all datasets.
        Datasets without a timerange are left untouched, and when no dataset
        has a timerange at all nothing is changed.

    Returns
    -------
        A "variables" section that can be used in an ESMValTool recipe.
    """
    variables: dict[str, Any] = {}
    for _, group in files.groupby(list(group_by)):
        facets = as_facets(group)
        short_name = facets.pop("short_name")
        entry = variables.setdefault(short_name, {"additional_datasets": []})
        entry["additional_datasets"].append(facets)

    if equalize_timerange:
        # Only datasets that actually carry a timerange take part.
        timed_datasets = [
            dataset
            for variable in variables.values()
            for dataset in variable["additional_datasets"]
            if "timerange" in dataset
        ]
        # Guard against max()/min() on an empty sequence, which would raise
        # ValueError when there is nothing to equalize.
        if timed_datasets:
            start_times, end_times = [], []
            for dataset in timed_datasets:
                start, end = dataset["timerange"].split("/")
                start_times.append(start)
                end_times.append(end)
            # The common period runs from the latest start to the earliest
            # end; the bounds are compared as strings.
            # NOTE: no check is made that the resulting start precedes the end.
            timerange = f"{max(start_times)}/{min(end_times)}"
            for dataset in timed_datasets:
                dataset["timerange"] = timerange

    return variables

fix_annual_statistics_keep_year(recipe) #

Add keep_group_coordinates: true to every annual_statistics step.

ESMValCore changed annual_statistics to remove the year coordinate by default (keep_group_coordinates=False). Several ESMValTool diagnostic scripts still rely on the coordinate being present, so we patch the recipe to preserve it.

Remove this workaround once ESMValCore restores the old default or the affected diagnostic scripts are updated.

Parameters:

Name Type Description Default
recipe Recipe

The recipe to update in place.

required
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def fix_annual_statistics_keep_year(recipe: Recipe) -> None:
    """Force ``keep_group_coordinates: true`` on ``annual_statistics`` steps.

    ESMValCore changed ``annual_statistics`` to remove the ``year``
    coordinate by default (``keep_group_coordinates=False``).  Several
    ESMValTool diagnostic scripts still rely on the coordinate being
    present, so the recipe is patched to preserve it. A step that already
    sets ``keep_group_coordinates`` explicitly is left as it is.

    Remove this workaround once ESMValCore restores the old default or
    the affected diagnostic scripts are updated.

    Parameters
    ----------
    recipe
        The recipe to update in place.
    """
    step = "annual_statistics"
    for settings in recipe.get("preprocessors", {}).values():
        if not isinstance(settings, dict) or step not in settings:
            continue
        current = settings[step]
        if isinstance(current, dict):
            # Respect an explicit choice; only fill in a missing key.
            current.setdefault("keep_group_coordinates", True)
            continue
        # Shorthand forms are expanded to a full mapping: a bare string is
        # taken as the operator, anything else falls back to "mean".
        operator = current if isinstance(current, str) else "mean"
        settings[step] = {
            "operator": operator,
            "keep_group_coordinates": True,
        }

get_child_and_parent_dataset(df, parent_experiment, child_duration_in_years, parent_offset_in_years, parent_duration_in_years) #

Retrieve the child and parent dataset in recipe format from a dataframe.

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def get_child_and_parent_dataset(
    df: pd.DataFrame,
    parent_experiment: str,
    child_duration_in_years: int,
    parent_offset_in_years: int,
    parent_duration_in_years: int,
) -> list[dict[str, str | list[str]]]:
    """Retrieve the child and parent dataset in recipe format from a dataframe.

    Rows whose ``experiment_id`` equals ``parent_experiment`` make up the
    parent dataset; all remaining rows make up the child dataset.

    Parameters
    ----------
    df
        The dataframe describing the input files of both experiments.
    parent_experiment
        The ``experiment_id`` of the parent experiment.
    child_duration_in_years
        Number of years in the child timerange, counted from the year of
        the earliest child start time.
    parent_offset_in_years
        Number of years added to the year of the computed parent start.
    parent_duration_in_years
        Number of years in the parent timerange.

    Returns
    -------
        A two-element list: the child dataset followed by the parent dataset,
        each with its ``timerange`` set to a ``YYYY/YYYY`` range of years.

    Raises
    ------
    ValueError
        If the dataframe has no rows for the parent or for the child.
    """

    def _decode(value: Any, attrs: Any) -> Any:
        # Interpret a numeric time using the units/calendar of the given row.
        return cftime.num2date(
            value,
            units=attrs["time_units"],
            calendar=attrs["calendar"],
        )

    def _year_range(first_year: int, n_years: int) -> str:
        # The end year of the timerange is inclusive, so subtract 1.
        last_year = first_year + n_years - 1
        return f"{first_year:04d}/{last_year:04d}"

    parent_df = df[(df.experiment_id == parent_experiment)]
    child_df = df[(df.experiment_id != parent_experiment)]

    if parent_df.empty:  # pragma: no branch
        raise ValueError(f"No dataset found for parent experiment '{parent_experiment}'")
    if child_df.empty:  # pragma: no branch
        raise ValueError(f"No dataset found for child experiment (not '{parent_experiment}')")

    # Align the two datasets in time: the difference between the branch time
    # expressed in the parent's time axis and in the child's time axis is the
    # offset between the child start and the matching parent start.
    # Only the first row of each dataset supplies the attributes.
    parent_attrs = parent_df.iloc[0]
    child_attrs = child_df.iloc[0]
    branch_in_parent = _decode(child_attrs["branch_time_in_parent"], parent_attrs)
    branch_in_child = _decode(child_attrs["branch_time_in_child"], child_attrs)
    child_start = child_df["start_time"].dropna().min()
    parent_start = child_start + (branch_in_parent - branch_in_child)

    # Create the datasets for use in the recipe.
    var_name = child_attrs["variable_id"]
    child_dataset = dataframe_to_recipe(child_df)[var_name]["additional_datasets"][0]
    child_dataset["timerange"] = _year_range(child_start.year, child_duration_in_years)

    parent_dataset = dataframe_to_recipe(parent_df)[var_name]["additional_datasets"][0]
    parent_dataset["timerange"] = _year_range(
        parent_start.year + parent_offset_in_years,
        parent_duration_in_years,
    )

    return [child_dataset, parent_dataset]

load_recipe(recipe) #

Load a recipe.

Parameters:

Name Type Description Default
recipe str

The name of an ESMValTool recipe.

required

Returns:

Type Description
The loaded recipe.
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def load_recipe(recipe: str) -> Recipe:
    """Fetch a recipe from the recipes registry and parse it.

    Parameters
    ----------
    recipe
        The name of an ESMValTool recipe.

    Returns
    -------
        The loaded recipe, rebuilt so that no dict or list inside it is
        shared between different places in the recipe.
    """

    def _unshared(node: Any) -> Any:
        # Rebuild every container so that editing one part of the recipe
        # cannot silently change another part that referred to the same
        # object after parsing.
        if isinstance(node, list):
            return [_unshared(item) for item in node]
        if isinstance(node, dict):
            return {key: _unshared(value) for key, value in node.items()}
        return node

    recipe_path = Path(dataset_registry_manager[_RECIPES_REGISTRY_NAME].fetch(recipe))
    parsed = yaml.safe_load(recipe_path.read_text(encoding="utf-8"))
    return _unshared(parsed)  # type: ignore[no-any-return]

prepare_climate_data(datasets, climate_data_dir) #

Symlink the input files from the Pandas dataframe into a directory tree.

This ensures that ESMValTool can find the data and only uses the requested data.

Parameters:

Name Type Description Default
datasets DataFrame

The pandas dataframe describing the input datasets.

required
climate_data_dir Path

The directory where ESMValTool should look for input data.

required
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def prepare_climate_data(datasets: pd.DataFrame, climate_data_dir: Path) -> None:
    """Symlink the input files from the Pandas dataframe into a directory tree.

    This ensures that ESMValTool can find the data and only uses the
    requested data.

    obs4MIPs files are placed under ``obs4MIPs/<source_id>/<version>``, where
    the version is the last component of the instance_id. Files of every
    other project are placed in a directory per dot-separated component of
    the instance_id.

    Parameters
    ----------
    datasets
        The pandas dataframe describing the input datasets.
    climate_data_dir
        The directory where ESMValTool should look for input data.

    Raises
    ------
    ValueError
        If a row has an ``instance_id`` or ``path`` that is not a string.
    """
    # Directories that have already been swept for dangling symlinks.
    swept: set[Path] = set()

    for row in datasets.itertuples():
        if not isinstance(row.instance_id, str):  # pragma: no branch
            msg = f"Invalid instance_id encountered in {row}"
            raise ValueError(msg)
        if not isinstance(row.path, str):  # pragma: no branch
            msg = f"Invalid path encountered in {row}"
            raise ValueError(msg)

        components = row.instance_id.split(".")
        if row.instance_id.startswith("obs4MIPs."):
            subdirs: list[str] = ["obs4MIPs", row.source_id, components[-1]]  # type: ignore[list-item]
        else:
            subdirs = components

        link = climate_data_dir.joinpath(*subdirs) / Path(row.path).name
        link_dir = link.parent
        link_dir.mkdir(parents=True, exist_ok=True)

        # Remove any stale symlinks in the target directory to prevent
        # ESMValCore from finding dangling symlinks from previous runs.
        # Each directory is swept only once per call.
        if link_dir not in swept:
            for entry in link_dir.iterdir():
                if entry.is_symlink() and not entry.resolve().exists():
                    entry.unlink()
            swept.add(link_dir)

        # Replace whatever currently occupies the link location.
        if link.is_symlink() or link.exists():
            link.unlink()
        link.symlink_to(row.path)

rewrite_mip_for_cmip7(recipe) #

Rewrite CMIP6 MIP table names to CMIP7 realm names in a recipe.

Base ESMValTool recipes have CMIP6 MIP table names (e.g. Amon, Lmon) hardcoded in the diagnostics/variables section. When the recipe uses CMIP7 data, these must be rewritten to CMIP7 realm names (e.g. atmos, land).

Parameters:

Name Type Description Default
recipe Recipe

The recipe to update in place.

required
Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py
def rewrite_mip_for_cmip7(recipe: Recipe) -> None:
    """Replace CMIP6 MIP table names with CMIP7 realm names in a recipe.

    Base ESMValTool recipes have CMIP6 MIP table names (e.g. ``Amon``,
    ``Lmon``) hardcoded in the diagnostics/variables section. When the recipe
    uses CMIP7 data, these must be rewritten to CMIP7 realm names
    (e.g. ``atmos``, ``land``). A recipe without any dataset whose
    ``project`` is ``CMIP7`` is left unchanged.

    Parameters
    ----------
    recipe
        The recipe to update in place.
    """
    uses_cmip7 = any(
        dataset.get("project") == "CMIP7" for dataset in _iter_recipe_datasets(recipe)
    )
    if uses_cmip7:
        for diagnostic in recipe.get("diagnostics", {}).values():
            for settings in diagnostic.get("variables", {}).values():
                # Only variable entries given as a mapping are rewritten.
                if isinstance(settings, dict):
                    _rewrite_variable_mip(settings)