Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
13fa0bb
Refactor long_dataframe
ecomodeller Jan 3, 2024
dee2220
Test should run irrespective of branch
ecomodeller Jan 3, 2024
46cac79
Undo
ecomodeller Jan 4, 2024
1807f3a
TODO
ecomodeller Jan 4, 2024
aa71ad6
sk for skill, gs for gridded_skill
jsmariegaard Jan 4, 2024
c1eafad
groupby sort=False, xy nan if non-unique
jsmariegaard Jan 4, 2024
b9c8131
temporary disable nan for non-unique xy
jsmariegaard Jan 4, 2024
cbf452e
rename local
jsmariegaard Jan 4, 2024
2d3345e
Old bug fixed - due to previous different ordering in skill this lead…
jsmariegaard Jan 5, 2024
f9edfc9
fix tests
jsmariegaard Jan 5, 2024
340f05b
to_dataframe
jsmariegaard Jan 5, 2024
23b07ea
test to_dataframe methods
jsmariegaard Jan 5, 2024
5b90d16
docstrings and sort_values
jsmariegaard Jan 5, 2024
20c0a09
include aux data in to_dataframe
jsmariegaard Jan 5, 2024
8b7c307
docstrings
jsmariegaard Jan 5, 2024
3ada90b
_append_xy_to_res; fixes wrong x y for track
jsmariegaard Jan 5, 2024
8c405a6
insert note in docstring for matching multi models
jsmariegaard Jan 5, 2024
8e6e981
n_points_removed
jsmariegaard Jan 5, 2024
99dc10b
Merge pull request #382 from DHI/refactor_long_dataframe
jsmariegaard Jan 7, 2024
a13ce4c
catch new warning
jsmariegaard Jan 7, 2024
df8b196
Merge branch 'groupby-sort-false' of https://github.com/DHI/modelskil…
jsmariegaard Jan 7, 2024
9355c3a
test_xy_in_skill
jsmariegaard Jan 7, 2024
d78762e
test_xy_in_skill_no_obs
jsmariegaard Jan 7, 2024
5aecd56
test_skill_mm_swaplevel, test_skill_mm_sort_index
jsmariegaard Jan 7, 2024
0e79b07
test_skill_mm_sort_values
jsmariegaard Jan 7, 2024
2a2ae4b
Use .data instead of ._df when creating new SkillTable class
jsmariegaard Jan 7, 2024
17b0363
test x, y on skill() from Comparer; and that x, y are kept under oper…
jsmariegaard Jan 7, 2024
364971e
also sel
jsmariegaard Jan 7, 2024
a542443
More tweaks to long_dataframe
ecomodeller Jan 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/full_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ on:
push:
branches: [ main]
pull_request:
branches: [ main ]

jobs:
build:
Expand Down
105 changes: 64 additions & 41 deletions modelskill/comparison/_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Iterable,
overload,
Hashable,
Tuple,
)
import warnings
import zipfile
Expand Down Expand Up @@ -82,17 +83,23 @@ def _all_df_template(n_quantities: int = 1):
class ComparerCollection(Mapping, Scoreable):
"""
Collection of comparers, constructed by calling the `modelskill.match`
method. Can be indexed by name or index.
method or by initializing with a list of comparers.

NOTE: In case of multiple model results with different time coverage,
only the _overlapping_ time period will be used! (intersection)

Examples
--------
>>> import modelskill as ms
>>> mr = ms.DfsuModelResult("Oresund2D.dfsu", item=0)
>>> o1 = ms.PointObservation("klagshamn.dfs0", item=0, x=366844, y=6154291, name="Klagshamn")
>>> o2 = ms.PointObservation("drogden.dfs0", item=0, x=355568.0, y=6156863.0)
>>> cc = ms.match(obs=[o1,o2], mod=mr)
>>> sk = cc.skill()
>>> cc["Klagshamn"].plot.timeseries()
>>> cmp1 = ms.match(o1, mr) # Comparer
>>> cmp2 = ms.match(o2, mr) # Comparer
>>> ccA = ms.ComparerCollection([cmp1, cmp2])
>>> ccB = ms.match(obs=[o1, o2], mod=mr)
>>> sk = ccB.skill()
>>> ccB["Klagshamn"].plot.timeseries()
"""

plotter = ComparerCollectionPlotter
Expand Down Expand Up @@ -562,65 +569,80 @@ def skill(
res = _groupby_df(df, by, pmetrics)
mtr_cols = [m.__name__ for m in pmetrics] # type: ignore
res = res.dropna(subset=mtr_cols, how="all") # TODO: ok to remove empty?
res["x"] = df.groupby(by=by, observed=False).x.first()
res["y"] = df.groupby(by=by, observed=False).y.first()
# TODO: set x,y to NaN if TrackObservation, x.nunique() > 1
res = self._append_xy_to_res(res, cc)
res = cc._add_as_col_if_not_in_index(df, skilldf=res)
return SkillTable(res)

def _to_long_dataframe(self, attrs_keys=None, observed=False) -> pd.DataFrame:
def _to_long_dataframe(
self, attrs_keys: Iterable[str] | None = None, observed: bool = False
) -> pd.DataFrame:
"""Return a copy of the data as a long-format pandas DataFrame (for groupby operations)"""
# TODO delegate to each comparer
attrs_keys = attrs_keys or []
res = _all_df_template(self.n_quantities)
frames = []
cols = list(res.keys()) + attrs_keys

for cmp in self._comparers.values():
for j in range(cmp.n_models):
mod_name = cmp.mod_names[j]
attrs = {key: cmp.data.attrs.get(key, False) for key in attrs_keys}
for mod_name in cmp.mod_names:
# drop "x", "y", ?
df = cmp.data.drop_vars(["z"])[[mod_name]].to_dataframe().copy()
df = df.rename(columns={mod_name: "mod_val"})
df["model"] = mod_name
df["observation"] = cmp.name
df = (
cmp.data[[mod_name]]
.to_dataframe()
.copy()
.rename(columns={mod_name: "mod_val"})
.assign(model=mod_name, observation=cmp.name, x=cmp.x, y=cmp.y)
.assign(obs_val=cmp.data["Observation"].values)
.assign(**attrs)
)
if self.n_quantities > 1:
df["quantity"] = cmp.quantity.name
df["x"] = cmp.x
df["y"] = cmp.y
df["obs_val"] = cmp.data["Observation"].values
for key in attrs_keys:
if key in cmp.data.attrs:
df[key] = cmp.data.attrs[key]
else:
df[key] = False
frames.append(df[cols])
if len(frames) > 0:
res = pd.concat(frames)

frames.append(df)
res = pd.concat(frames)
cat_cols = res.select_dtypes(include=["object"]).columns
res[cat_cols] = res[cat_cols].astype("category")

if observed:
res = res.loc[~(res == False).any(axis=1)] # noqa
res.index.name = "time"
# TODO change index to regular column, it is not unique
return res

@staticmethod
def _attrs_keys_in_by(by):
"""Check if 'attrs:' is in by and return attrs_keys"""
def _attrs_keys_in_by(by: str | List[str]) -> Tuple[List[str], List[str]]:
"""skill() helper: Check if 'attrs:' is in by and return attrs_keys"""
attrs_keys = []
by = [by] if isinstance(by, str) else by
for j, b in enumerate(by):
if isinstance(b, str) and b.startswith("attrs:"):
key = b.split(":")[1]
attrs_keys.append(key)
by[j] = key # remove 'attrs:' prefix
attrs_keys = None if len(attrs_keys) == 0 else attrs_keys
return by, attrs_keys

@staticmethod
def _append_xy_to_res(res: pd.DataFrame, cc: ComparerCollection) -> pd.DataFrame:
    """skill() helper: attach x and y columns to the skill table where possible.

    Point observations get their fixed coordinates; all other rows
    (e.g. track observations) are left as NaN.
    """
    # seed both coordinate columns with "unknown" (mutates res in place,
    # matching how the caller uses the returned frame)
    res["x"] = np.nan
    res["y"] = np.nan

    if "observation" not in res.index.names:
        # no observation level in the index -> nothing to match against
        return res

    # temporarily flatten the (Multi)Index so rows can be
    # selected by observation name
    level_names = res.index.names
    flat = res.reset_index()
    for cmp in cc:
        if cmp.gtype != "point":
            continue
        mask = flat.observation == cmp.name
        flat.loc[mask, "x"] = cmp.x
        flat.loc[mask, "y"] = cmp.y
    return flat.set_index(level_names)

def _add_as_col_if_not_in_index(
self, df, skilldf, fields=["model", "observation", "quantity"]
):
"""Add a field to skilldf if unique in df"""
"""skill() helper: Add a field to skilldf if unique in df"""
for field in reversed(fields):
if (field == "model") and (self.n_models <= 1):
continue
Expand Down Expand Up @@ -681,7 +703,8 @@ def gridded_skill(
--------
>>> import modelskill as ms
>>> cc = ms.match([HKNA,EPL,c2], mr) # with satellite track measurements
>>> cc.gridded_skill(metrics='bias')
>>> gs = cc.gridded_skill(metrics='bias')
>>> gs.data
<xarray.Dataset>
Dimensions: (x: 5, y: 5)
Coordinates:
Expand All @@ -692,8 +715,8 @@ def gridded_skill(
n (x, y) int32 3 0 0 14 37 17 50 36 72 ... 0 0 15 20 0 0 0 28 76
bias (x, y) float64 -0.02626 nan nan ... nan 0.06785 -0.1143

>>> ds = cc.gridded_skill(binsize=0.5)
>>> ds.coords
>>> gs = cc.gridded_skill(binsize=0.5)
>>> gs.data.coords
Coordinates:
observation 'alti'
* x (x) float64 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5
Expand Down Expand Up @@ -788,9 +811,9 @@ def mean_skill(
>>> cc.mean_skill().round(2)
n bias rmse urmse mae cc si r2
HKZN_local 564 -0.09 0.31 0.28 0.24 0.97 0.09 0.99
>>> s = cc.mean_skill(weights="equal")
>>> s = cc.mean_skill(weights="points")
>>> s = cc.mean_skill(weights={"EPL": 2.0}) # more weight on EPL, others=1.0
>>> sk = cc.mean_skill(weights="equal")
>>> sk = cc.mean_skill(weights="points")
>>> sk = cc.mean_skill(weights={"EPL": 2.0}) # more weight on EPL, others=1.0
"""

# TODO remove in v1.1
Expand Down Expand Up @@ -819,13 +842,13 @@ def mean_skill(

# skill assessment
pmetrics = _parse_metric(metrics)
s = cc.skill(metrics=pmetrics)
if s is None:
sk = cc.skill(metrics=pmetrics)
if sk is None:
return None
skilldf = s.to_dataframe()
skilldf = sk.to_dataframe()

# weights
weights = cc._parse_weights(weights, s.obs_names)
weights = cc._parse_weights(weights, sk.obs_names)
skilldf["weights"] = (
skilldf.n if weights is None else np.tile(weights, len(mod_names)) # type: ignore
)
Expand Down Expand Up @@ -946,7 +969,7 @@ def _parse_weights(self, weights, observations):
weights = np.ones(n_obs) # equal weight to all
elif isinstance(weights, dict):
w_dict = weights
weights = [w_dict.get(name, 1.0) for name in (self.obs_names)]
weights = [w_dict.get(name, 1.0) for name in observations]

elif isinstance(weights, str):
if weights.lower() == "equal":
Expand Down
6 changes: 3 additions & 3 deletions modelskill/comparison/_collection_plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,13 +497,13 @@ def taylor(

metrics = [mtr._std_obs, mtr._std_mod, mtr.cc]
skill_func = self.cc.mean_skill if aggregate_observations else self.cc.skill
s = skill_func(
sk = skill_func(
metrics=metrics, # type: ignore
)
if s is None:
if sk is None:
return

df = s.to_dataframe()
df = sk.to_dataframe()
ref_std = 1.0 if normalize_std else df.iloc[0]["_std_obs"]

if isinstance(df.index, pd.MultiIndex):
Expand Down
31 changes: 26 additions & 5 deletions modelskill/comparison/_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,9 +973,8 @@ def skill(

df = cmp._to_long_dataframe()
res = _groupby_df(df, by, metrics)
res["x"] = df.groupby(by=by, observed=False).x.first()
res["y"] = df.groupby(by=by, observed=False).y.first()
# TODO: set x,y to NaN if TrackObservation
res["x"] = np.nan if self.gtype == "track" else cmp.x
res["y"] = np.nan if self.gtype == "track" else cmp.y
res = self._add_as_col_if_not_in_index(df, skilldf=res)
return SkillTable(res)

Expand Down Expand Up @@ -1107,8 +1106,8 @@ def gridded_skill(
n (x, y) int32 3 0 0 14 37 17 50 36 72 ... 0 0 15 20 0 0 0 28 76
bias (x, y) float64 -0.02626 nan nan ... nan 0.06785 -0.1143

>>> ds = cc.gridded_skill(binsize=0.5)
>>> ds.coords
>>> gs = cc.gridded_skill(binsize=0.5)
>>> gs.data.coords
Coordinates:
observation 'alti'
* x (x) float64 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5
Expand Down Expand Up @@ -1184,6 +1183,28 @@ def remove_bias(self, correct="Model") -> Comparer:
)
return cmp

def to_dataframe(self) -> pd.DataFrame:
    """Convert matched data to pandas DataFrame

    Include x, y coordinates only if gtype=track

    Returns
    -------
    pd.DataFrame
        data as a pandas DataFrame
    """
    if self.gtype == str(GeometryType.POINT):
        # scalar coordinate variables would otherwise appear as
        # (constant) columns in the dataframe, so drop them
        return self.data.drop_vars(["x", "y", "z"]).to_dataframe()
    if self.gtype == str(GeometryType.TRACK):
        df = self.data.drop_vars(["z"]).to_dataframe()
        # reorder columns so that x, y come first
        leading = ["x", "y"]
        trailing = [c for c in df.columns if c not in leading]
        return df[leading + trailing]
    raise NotImplementedError(f"Unknown gtype: {self.gtype}")

def save(self, filename: Union[str, Path]) -> None:
"""Save to netcdf file

Expand Down
5 changes: 1 addition & 4 deletions modelskill/comparison/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,16 @@ def _groupby_df(
) -> pd.DataFrame:
def calc_metrics(group):
row = {}
# row["x"] = group.x.first() if group.x.nunique() == 1 else np.nan
# row["y"] = group.y.first() if group.y.nunique() == 1 else np.nan
row["n"] = len(group)
for metric in metrics:
row[metric.__name__] = metric(group.obs_val, group.mod_val)
return pd.Series(row)

# .drop(columns=["x", "y"])
if _dt_in_by(by):
df, by = _add_dt_to_df(df, by)

# sort=False to avoid re-ordering compared to original cc (also for performance)
res = df.groupby(by=by, observed=False).apply(calc_metrics)
res = df.groupby(by=by, observed=False, sort=False).apply(calc_metrics)

if n_min:
# nan for all cols but n
Expand Down
14 changes: 13 additions & 1 deletion modelskill/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,14 @@ def match(
gtype=None,
max_model_gap=None,
):
"""Compare observations and model results
"""Match observation and model result data in space and time

NOTE: In case of multiple model results with different time coverage,
only the _overlapping_ time period will be used! (intersection)

NOTE: In case of multiple observations, multiple models can _only_
be matched if they are _all_ of SpatialField type, e.g. DfsuModelResult
or GridModelResult.

Parameters
----------
Expand Down Expand Up @@ -468,9 +475,14 @@ def _select_overlapping_trackdata_with_tolerance(
df = mod_df.join(obs_df, how="inner", lsuffix="_mod", rsuffix="_obs")

# 2. remove model points outside observation track
n_points = len(df)
keep_x = np.abs((df.x_mod - df.x_obs)) < spatial_tolerance
keep_y = np.abs((df.y_mod - df.y_obs)) < spatial_tolerance
df = df[keep_x & keep_y]
if n_points_removed := n_points - len(df):
warnings.warn(
f"Removed {n_points_removed} model points outside observation track (spatial_tolerance={spatial_tolerance})"
)
return mri.data.sel(time=df.index)


Expand Down
Loading