`climate_ref_esmvaltool.recipe` #

`as_facets(group)` #

Convert a group from the datasets dataframe to ESMValTool facets.

Parameters:

Name	Type	Description	Default
`group`	`DataFrame`	A group of datasets representing a single instance_id.	required

Returns:

Type	Description
`dict[str, Any]`	A dictionary containing facet-value pairs.

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def as_facets(
    group: pd.DataFrame,
) -> dict[str, Any]:
    """Build the ESMValTool facets for one group of dataset rows.

    Parameters
    ----------
    group:
        Rows of the datasets dataframe that belong to a single instance_id.

    Returns
    -------
        A :obj:`dict` mapping facet names to their values. A facet with one
        distinct value in the group is stored as a scalar, a facet with
        several distinct values as a list.

    """

    def collapse(column: Any) -> Any:
        # One distinct value -> scalar, several distinct values -> list.
        distinct = column.unique().tolist()
        return distinct[0] if len(distinct) <= 1 else distinct

    # The project is the first dot-separated component of the instance_id.
    project = group["instance_id"].iloc[0].partition(".")[0]
    facets = {"project": project}

    for esmvaltool_name, ref_name in FACETS[project].items():
        if esmvaltool_name != "activity":
            facets[esmvaltool_name] = collapse(group[ref_name])
            continue
        # The activity is read from the second component of the instance_id
        # so that it matches the directory structure created by
        # prepare_climate_data(). The activity_id column can hold
        # space-separated values (e.g. "C4MIP CDRMIP"), whereas the
        # instance_id always uses only the primary activity.
        primary_activity = group["instance_id"].apply(lambda instance_id: instance_id.split(".")[1])
        facets[esmvaltool_name] = collapse(primary_activity)

    # Groups without usable time information get no "timerange" facet.
    timerange = as_timerange(group)
    if timerange is not None:
        facets["timerange"] = timerange
    return facets

`as_isodate(timestamp)` #

Format a timestamp as an ISO 8601 datetime.

For example, '2014-12-16 12:00:00' will be formatted as '20141216T120000'.

Parameters:

Name	Type	Description	Default
`timestamp`	`Timestamp`	The timestamp to format.	required

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def as_isodate(timestamp: pd.Timestamp) -> str:
    """Render a timestamp in compact ISO 8601 form.

    For example, '2014-12-16 12:00:00' will be formatted as '20141216T120000'.

    Parameters
    ----------
    timestamp
        The timestamp to format; its ``str()`` representation is used.

    """
    # Single pass over the text: the space between date and time becomes "T",
    # while the "-" and ":" separators are dropped.
    compact = str.maketrans(" ", "T", "-:")
    return str(timestamp).translate(compact)

`as_timerange(group)` #

Format the timeranges from a dataframe as an ESMValTool timerange.

Parameters:

Name	Type	Description	Default
`group`	`DataFrame`	The dataframe describing a single dataset.	required

Returns:

Type	Description
`str | None`	The timerange formatted as `start/end`, or `None` if the group has no start times or no end times.

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def as_timerange(group: pd.DataFrame) -> str | None:
    """Format the timeranges from a dataframe as an ESMValTool timerange.

    Parameters
    ----------
    group
        The dataframe describing a single dataset.

    Returns
    -------
        A timerange.
    """
    # TODO: apply some rounding to avoid problems?
    # https://github.com/ESMValGroup/ESMValCore/issues/2048
    start_times = group.start_time.dropna()
    if start_times.empty:
        return None
    end_times = group.end_time.dropna()
    if end_times.empty:
        return None  # pragma: no cover
    return f"{as_isodate(start_times.min())}/{as_isodate(end_times.max())}"

`dataframe_to_recipe(files, group_by=('instance_id',), equalize_timerange=False)` #

Convert the datasets dataframe to a recipe "variables" section.

Parameters:

Name	Type	Description	Default
`files`	`DataFrame`	The pandas dataframe describing the input files.	required
`group_by`	`tuple[str, ...]`	The columns to group the input files by.	`('instance_id',)`
`equalize_timerange`	`bool`	If True, use the timerange that is covered by all datasets.	`False`

Returns:

Type	Description
`dict[str, Any]`	A "variables" section that can be used in an ESMValTool recipe.

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def dataframe_to_recipe(
    files: pd.DataFrame,
    group_by: tuple[str, ...] = ("instance_id",),
    equalize_timerange: bool = False,
) -> dict[str, Any]:
    """Convert the datasets dataframe to a recipe "variables" section.

    Parameters
    ----------
    files
        The pandas dataframe describing the input files.
    group_by
        The columns to group the input files by.
    equalize_timerange
        If True, use the timerange that is covered by all datasets.
        Datasets without a "timerange" facet are left untouched, and if no
        dataset has one the result is returned unchanged.

    Returns
    -------
        A "variables" section that can be used in an ESMValTool recipe:
        a mapping from short_name to ``{"additional_datasets": [...]}``.
    """
    variables: dict[str, Any] = {}
    for _, group in files.groupby(list(group_by)):
        facets = as_facets(group)
        short_name = facets.pop("short_name")
        if short_name not in variables:
            variables[short_name] = {"additional_datasets": []}
        variables[short_name]["additional_datasets"].append(facets)

    if equalize_timerange:
        # Only datasets that carry a timerange take part in the equalization.
        timed_datasets = [
            dataset
            for variable in variables.values()
            for dataset in variable["additional_datasets"]
            if "timerange" in dataset
        ]
        # Guard against the empty case: max()/min() raise ValueError on an
        # empty sequence, e.g. when no dataset has time information.
        if timed_datasets:
            start_times, end_times = [], []
            for dataset in timed_datasets:
                start, end = dataset["timerange"].split("/")
                start_times.append(start)
                end_times.append(end)
            # Latest start and earliest end, compared as formatted strings.
            timerange = f"{max(start_times)}/{min(end_times)}"
            for dataset in timed_datasets:
                dataset["timerange"] = timerange

    return variables

`fix_annual_statistics_keep_year(recipe)` #

Add keep_group_coordinates: true to every annual_statistics step.

ESMValCore changed annual_statistics to remove the year coordinate by default (keep_group_coordinates=False). Several ESMValTool diagnostic scripts still rely on the coordinate being present, so we patch the recipe to preserve it.

Remove this workaround once ESMValCore restores the old default or the affected diagnostic scripts are updated.

Parameters:

Name	Type	Description	Default
`recipe`	`Recipe`	The recipe to update in place.	required

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def fix_annual_statistics_keep_year(recipe: Recipe) -> None:
    """Force ``keep_group_coordinates: true`` on ``annual_statistics`` steps.

    ESMValCore changed ``annual_statistics`` to remove the ``year``
    coordinate by default (``keep_group_coordinates=False``).  Several
    ESMValTool diagnostic scripts still rely on the coordinate being
    present, so the recipe is patched to preserve it.  A step that already
    sets ``keep_group_coordinates`` explicitly is left as it is.

    Remove this workaround once ESMValCore restores the old default or
    the affected diagnostic scripts are updated.

    Parameters
    ----------
    recipe
        The recipe to update in place.
    """
    step = "annual_statistics"
    for settings in recipe.get("preprocessors", {}).values():
        # Skip preprocessors that are not mappings or do not use the step.
        if not isinstance(settings, dict) or step not in settings:
            continue

        current = settings[step]
        if isinstance(current, dict):
            # Respect an explicit choice; only fill in the missing key.
            if "keep_group_coordinates" not in current:
                current["keep_group_coordinates"] = True
            continue

        # Non-mapping form: a string is taken as the operator, anything else
        # falls back to "mean".
        operator = current if isinstance(current, str) else "mean"
        settings[step] = {
            "operator": operator,
            "keep_group_coordinates": True,
        }

`get_child_and_parent_dataset(df, parent_experiment, child_duration_in_years, parent_offset_in_years, parent_duration_in_years)` #

Retrieve the child and parent datasets, in recipe format, from a dataframe. The result is a list containing the child dataset followed by the parent dataset.

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def get_child_and_parent_dataset(
    df: pd.DataFrame,
    parent_experiment: str,
    child_duration_in_years: int,
    parent_offset_in_years: int,
    parent_duration_in_years: int,
) -> list[dict[str, str | list[str]]]:
    """Retrieve the child and parent dataset in recipe format from a dataframe."""
    parent_df = df[(df.experiment_id == parent_experiment)]
    child_df = df[(df.experiment_id != parent_experiment)]

    if parent_df.empty:  # pragma: no branch
        raise ValueError(f"No dataset found for parent experiment '{parent_experiment}'")
    if child_df.empty:  # pragma: no branch
        raise ValueError(f"No dataset found for child experiment (not '{parent_experiment}')")

    # Compute the start time of the child and parent datasets using the
    # branch_time_in_parent and branch_time_in_child attributes to compute the offset.
    # This ensures that the datasets are aligned correctly in time.
    parent_attrs = parent_df.iloc[0]
    child_attrs = child_df.iloc[0]
    branch_time_in_parent = cftime.num2date(
        child_attrs["branch_time_in_parent"],
        units=parent_attrs["time_units"],
        calendar=parent_attrs["calendar"],
    )
    branch_time_in_child = cftime.num2date(
        child_attrs["branch_time_in_child"],
        units=child_attrs["time_units"],
        calendar=child_attrs["calendar"],
    )
    child_start = child_df["start_time"].dropna().min()
    if isinstance(child_start, str):
        # Catalogs serialised to YAML store start_time as an ISO-like string
        # but the database returns CFDatetime objects.
        child_start = cftime.datetime.strptime(
            child_start, "%Y-%m-%d %H:%M:%S", calendar=child_attrs["calendar"]
        )
    parent_start = child_start + (branch_time_in_parent - branch_time_in_child)

    # Create the datasets for use in the recipe.
    var_name = child_attrs["variable_id"]
    child_dataset = dataframe_to_recipe(child_df)[var_name]["additional_datasets"][0]
    # The end year of the timerange is inclusive, so subtract 1.
    child_end_year = child_start.year + child_duration_in_years - 1
    child_dataset["timerange"] = f"{child_start.year:04d}/{child_end_year:04d}"

    parent_dataset = dataframe_to_recipe(parent_df)[var_name]["additional_datasets"][0]
    parent_start_year = parent_start.year + parent_offset_in_years
    parent_end_year = parent_start_year + parent_duration_in_years - 1
    parent_dataset["timerange"] = f"{parent_start_year:04d}/{parent_end_year:04d}"

    return [child_dataset, parent_dataset]

`load_recipe(recipe)` #

Load a recipe.

Parameters:

Name	Type	Description	Default
`recipe`	`str`	The name of an ESMValTool recipe.	required

Returns:

Type	Description
`Recipe`	The loaded recipe.

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def load_recipe(recipe: str) -> Recipe:
    """Fetch an ESMValTool recipe from the recipes registry and parse it.

    Parameters
    ----------
    recipe
        The name of an ESMValTool recipe.

    Returns
    -------
        The loaded recipe, rebuilt so that no dict or list object is shared
        between different places in the structure.
    """
    recipe_path = Path(dataset_registry_manager[_RECIPES_REGISTRY_NAME].fetch(recipe))

    def unshare(node: Any) -> Any:
        # Rebuild every dict and list so that objects in the recipe are not
        # shared; all other values are returned as they are.
        if isinstance(node, dict):
            return {key: unshare(value) for key, value in node.items()}
        if isinstance(node, list):
            return [unshare(element) for element in node]
        return node

    recipe_text = recipe_path.read_text(encoding="utf-8")
    return unshare(yaml.safe_load(recipe_text))  # type: ignore[no-any-return]

`prepare_climate_data(datasets, climate_data_dir)` #

Symlink the input files from the Pandas dataframe into a directory tree.

This ensures that ESMValTool can find the data and only uses the requested data.

Parameters:

Name	Type	Description	Default
`datasets`	`DataFrame`	The pandas dataframe describing the input datasets.	required
`climate_data_dir`	`Path`	The directory where ESMValTool should look for input data.	required

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def prepare_climate_data(datasets: pd.DataFrame, climate_data_dir: Path) -> None:
    """Mirror the input files as symlinks inside a directory tree.

    This ensures that ESMValTool can find the data and only uses the
    requested data.

    Parameters
    ----------
    datasets
        The pandas dataframe describing the input datasets.
    climate_data_dir
        The directory where ESMValTool should look for input data.

    Raises
    ------
    ValueError
        If a row has an ``instance_id`` or ``path`` that is not a string.
    """
    # Directories whose dangling symlinks have already been removed, so each
    # directory is scanned only once.
    purged: set[Path] = set()

    for row in datasets.itertuples():
        if not isinstance(row.instance_id, str):  # pragma: no branch
            msg = f"Invalid instance_id encountered in {row}"
            raise ValueError(msg)
        if not isinstance(row.path, str):  # pragma: no branch
            msg = f"Invalid path encountered in {row}"
            raise ValueError(msg)

        id_parts = row.instance_id.split(".")
        if row.instance_id.startswith("obs4MIPs."):
            # obs4MIPs data lives under <project>/<source_id>/<version>, with
            # the version taken from the last instance_id component.
            subdirs: list[str] = ["obs4MIPs", row.source_id, id_parts[-1]]  # type: ignore[list-item]
        else:
            # CMIP7 and all other projects use one directory level per
            # instance_id component.
            subdirs = id_parts

        link = climate_data_dir.joinpath(*subdirs) / Path(row.path).name
        folder = link.parent
        folder.mkdir(parents=True, exist_ok=True)

        # Remove any stale symlinks in the target directory to prevent
        # ESMValCore from finding dangling symlinks from previous runs
        if folder not in purged:
            for entry in folder.iterdir():
                if entry.is_symlink() and not entry.resolve().exists():
                    entry.unlink()
            purged.add(folder)

        # Replace whatever currently occupies the link location.
        if link.is_symlink() or link.exists():
            link.unlink()
        link.symlink_to(row.path)

`rewrite_mip_for_cmip7(recipe)` #

Rewrite CMIP6 MIP table names to CMIP7 realm names in a recipe.

Base ESMValTool recipes have CMIP6 MIP table names (e.g. Amon, Lmon) hardcoded in the diagnostics/variables section. When the recipe uses CMIP7 data, these must be rewritten to CMIP7 realm names (e.g. atmos, land).

Parameters:

Name	Type	Description	Default
`recipe`	`Recipe`	The recipe to update in place.	required

Source code in packages/climate-ref-esmvaltool/src/climate_ref_esmvaltool/recipe.py

def rewrite_mip_for_cmip7(recipe: Recipe) -> None:
    """Replace CMIP6 MIP table names with CMIP7 realm names in a recipe.

    Base ESMValTool recipes have CMIP6 MIP table names (e.g. ``Amon``,
    ``Lmon``) hardcoded in the diagnostics/variables section. When the recipe
    uses CMIP7 data, these must be rewritten to CMIP7 realm names
    (e.g. ``atmos``, ``land``). A recipe without any dataset whose
    ``project`` is ``CMIP7`` is left unchanged.

    Parameters
    ----------
    recipe
        The recipe to update in place.
    """
    # Stop scanning at the first dataset that belongs to CMIP7.
    uses_cmip7 = False
    for dataset in _iter_recipe_datasets(recipe):
        if dataset.get("project") == "CMIP7":
            uses_cmip7 = True
            break
    if not uses_cmip7:
        return

    for diagnostic in recipe.get("diagnostics", {}).values():
        for settings in diagnostic.get("variables", {}).values():
            # Entries that are not mappings are skipped.
            if not isinstance(settings, dict):
                continue
            _rewrite_variable_mip(settings)

climate_ref_esmvaltool.recipe #

as_facets(group) #

as_isodate(timestamp) #

as_timerange(group) #

dataframe_to_recipe(files, group_by=('instance_id',), equalize_timerange=False) #

fix_annual_statistics_keep_year(recipe) #

get_child_and_parent_dataset(df, parent_experiment, child_duration_in_years, parent_offset_in_years, parent_duration_in_years) #

load_recipe(recipe) #

prepare_climate_data(datasets, climate_data_dir) #

rewrite_mip_for_cmip7(recipe) #

`climate_ref_esmvaltool.recipe` #

`as_facets(group)` #

`as_isodate(timestamp)` #

`as_timerange(group)` #

`dataframe_to_recipe(files, group_by=('instance_id',), equalize_timerange=False)` #

`fix_annual_statistics_keep_year(recipe)` #

`get_child_and_parent_dataset(df, parent_experiment, child_duration_in_years, parent_offset_in_years, parent_duration_in_years)` #

`load_recipe(recipe)` #

`prepare_climate_data(datasets, climate_data_dir)` #

`rewrite_mip_for_cmip7(recipe)` #