diff --git a/src/av2/evaluation/scene_flow/eval.py b/src/av2/evaluation/scene_flow/eval.py index 8824a711..d979a3f3 100644 --- a/src/av2/evaluation/scene_flow/eval.py +++ b/src/av2/evaluation/scene_flow/eval.py @@ -2,9 +2,11 @@ from __future__ import annotations +import zipfile from collections import defaultdict from pathlib import Path -from typing import Any, DefaultDict, Dict, Final, List, Tuple, Union, cast +from typing import Any, Callable, DefaultDict, Dict, Final, List, Optional, Tuple, Union, cast +from zipfile import ZipFile import click import numpy as np @@ -288,26 +290,25 @@ def compute_metrics( return results -def evaluate_directories(annotations_dir: Path, predictions_dir: Path) -> pd.DataFrame: - """Run the evaluation on predictions and labels saved to disk. +def evaluate_predictions(annotations_dir: Path, get_prediction: Callable[[Path], Optional[pd.DataFrame]]) -> pd.DataFrame: + """Run the evaluation on predictions and labels. Args: annotations_dir: Path to the directory containing the annotation files produced by `make_annotation_files.py`. - predictions_dir: Path to the prediction files in submission format. + get_prediction: Function that retrieves a predictions DataFrame for a given relative + annotation filepath, or None if no prediction exists. Returns: DataFrame containing the average metrics on each subset of each example. 
""" results: DefaultDict[str, List[Any]] = defaultdict(list) - annotation_files = list(annotations_dir.rglob("*.feather")) + annotation_files = sorted(annotations_dir.rglob("*.feather")) for anno_file in track(annotation_files, description="Evaluating..."): gts = pd.read_feather(anno_file) - name: str = str(anno_file.relative_to(annotations_dir)) - pred_file = predictions_dir / name - if not pred_file.exists(): - print(f"Warning: File {name} is missing!") + name: Path = anno_file.relative_to(annotations_dir) + pred = get_prediction(name) + if pred is None: continue - pred = pd.read_feather(pred_file) current_example_results = compute_metrics( pred[list(constants.FLOW_COLUMNS)].to_numpy().astype(float), pred["is_dynamic"].to_numpy().astype(bool), @@ -319,7 +320,7 @@ def evaluate_directories(annotations_dir: Path, predictions_dir: Path) -> pd.Dat constants.FOREGROUND_BACKGROUND_BREAKDOWN, ) num_subsets = len(list(current_example_results.values())[0]) - results["Example"] += [name for _ in range(num_subsets)] + results["Example"] += [str(name) for _ in range(num_subsets)] for m in current_example_results: results[m] += current_example_results[m] df = pd.DataFrame( @@ -331,6 +332,68 @@ def evaluate_directories(annotations_dir: Path, predictions_dir: Path) -> pd.Dat return df +def get_prediction_from_directory(annotation_name: Path, predictions_dir: Path) -> Optional[pd.DataFrame]: + """Get the prediction corresponding annotation from a directory of prediction files. + + Args: + annotation_name: Relative path to the annotation file. + predictions_dir: Path to the predicition files in submission_format. + + Returns: + DataFrame contating the predictions for that annotation file or None if it does not exist. 
+ """ + pred_file = predictions_dir / annotation_name + if not pred_file.exists(): + return None + pred = pd.read_feather(pred_file) + return pred + + +def get_prediction_from_zipfile(annotation_name: Path, predictions_zip: Path) -> Optional[pd.DataFrame]: + """Get the prediction corresponding annotation from a zip archive of prediction files. + + Args: + annotation_name: Relative path to the annotation file. + predictions_zip: Path to the prediction files in a zip archive. + + Returns: + DataFrame contating the predictions for that annotation file or None if it does not exist. + """ + with ZipFile(predictions_zip, "r") as zf: + name = annotation_name.as_posix() + path = zipfile.Path(zf, name) + if path.exists(): + return pd.read_feather(zf.open(name)) + else: + return None + + +def evaluate_directories(annotations_dir: Path, predictions_dir: Path) -> pd.DataFrame: + """Run the evaluation on predictions and labels saved to disk. + + Args: + annotations_dir: Path to the directory containing the annotation files produced by `make_annotation_files.py`. + predictions_dir: Path to the prediction files in submission format. + + Returns: + DataFrame containing the average metrics on each subset of each example. + """ + return evaluate_predictions(annotations_dir, lambda n: get_prediction_from_directory(n, predictions_dir)) + + +def evaluate_zip(annotations_dir: Path, predictions_zip: Path) -> pd.DataFrame: + """Run the evaluation on predictions and labels saved to disk. + + Args: + annotations_dir: Path to the directory containing the annotation files produced by `make_annotation_files.py`. + predictions_zip: Path to the prediction files in a zip archive. + + Returns: + DataFrame containing the average metrics on each subset of each example. 
+ """ + return evaluate_predictions(annotations_dir, lambda n: get_prediction_from_zipfile(n, predictions_zip)) + + def results_to_dict(frame: pd.DataFrame) -> Dict[str, float]: """Convert a results DataFrame to a dictionary of whole dataset metrics. diff --git a/src/av2/evaluation/scene_flow/make_submission_archive.py b/src/av2/evaluation/scene_flow/make_submission_archive.py index 23cbe253..175f336b 100644 --- a/src/av2/evaluation/scene_flow/make_submission_archive.py +++ b/src/av2/evaluation/scene_flow/make_submission_archive.py @@ -29,7 +29,7 @@ def validate(submission_dir: Path, mask_file: Path) -> None: if not input_file.exists(): raise FileNotFoundError(f"{input_file} not found in submission directory") pred = pd.read_feather(input_file) - expected_num_points = pd.read_feather(masks.open(filename)).sum() + expected_num_points = pd.read_feather(masks.open(filename)).sum().item() for c in SUBMISSION_COLUMNS: if c not in pred.columns: diff --git a/tests/unit/evaluation/scene_flow/test_sf_submission_pipeline.py b/tests/unit/evaluation/scene_flow/test_sf_submission_pipeline.py index 0080a654..8c863523 100644 --- a/tests/unit/evaluation/scene_flow/test_sf_submission_pipeline.py +++ b/tests/unit/evaluation/scene_flow/test_sf_submission_pipeline.py @@ -2,16 +2,33 @@ import tempfile from pathlib import Path +from typing import Final +from zipfile import ZipFile import numpy as np +import pandas as pd import av2.evaluation.scene_flow.eval as eval from av2.evaluation.scene_flow.example_submission import example_submission from av2.evaluation.scene_flow.make_annotation_files import make_annotation_files from av2.evaluation.scene_flow.make_mask_files import make_mask_files -from av2.evaluation.scene_flow.make_submission_archive import make_submission_archive +from av2.evaluation.scene_flow.make_submission_archive import make_submission_archive, validate +from av2.evaluation.scene_flow.utils import compute_eval_point_mask +from av2.torch.data_loaders.scene_flow import 
SceneFlowDataloader -_TEST_DATA_ROOT = Path(__file__).resolve().parent.parent.parent +_TEST_DATA_ROOT: Final = Path(__file__).resolve().parent.parent.parent + + +def _zipdir(directory: Path, output_file: Path) -> None: + """Zip a directory into output_file. + + Args: + directory: The directory to recursively zip up. + output_file: The name of the output archive. + """ + with ZipFile(output_file, "w") as zf: + for f in directory.rglob("**"): + zf.write(f, arcname=str(f.relative_to(directory))) def test_submission() -> None: @@ -45,4 +62,135 @@ def test_submission() -> None: assert results[metric] < 1e-4 output_file = test_dir / "submission.zip" - make_submission_archive(str(predictions_dir), str(mask_file), str(output_file)) + success = make_submission_archive(str(predictions_dir), str(mask_file), str(output_file)) + assert success + assert output_file.stat().st_size > 0 + + annotation_files = list(annotations_dir.rglob("*.feather")) + print([anno_file.relative_to(annotations_dir).as_posix() for anno_file in annotation_files]) + with ZipFile(output_file, "r") as zf: + files = {f.filename for f in zf.filelist} + print(files) + + results_zip = eval.results_to_dict(eval.evaluate_zip(annotations_dir, output_file)) + for metric in results: + assert np.allclose(results[metric], results_zip[metric], equal_nan=True) + + empty_predictions_dir = test_dir / "bad_output_1" + empty_predictions_dir.mkdir() + success = make_submission_archive(str(empty_predictions_dir), str(mask_file), str(output_file)) + assert not success + + failed = False + try: + validate(empty_predictions_dir, mask_file) + except FileNotFoundError: + failed = True + assert failed + + # Missing a column + log_id = "7fab2350-7eaf-3b7e-a39d-6937a4c1bede" + timestamp_ns = 315966265259836000 + data_loader = SceneFlowDataloader(_TEST_DATA_ROOT, "test_data", "val") + sweep_0, sweep_1, s1_SE3_s0, _ = data_loader[0] + mask = compute_eval_point_mask((sweep_0, sweep_1, s1_SE3_s0, None)) + npts = mask.sum() + bad_df = 
pd.DataFrame( + { + "flow_tx_m": np.zeros(npts, dtype=np.float16), + "flow_ty_m": np.zeros(npts, dtype=np.float16), + "flow_tz_m": np.zeros(npts, dtype=np.float16), + } + ) + bad_cols_predictions_dir = test_dir / "bad_output_2" / log_id + bad_cols_predictions_dir.mkdir(parents=True, exist_ok=True) + bad_df.to_feather(bad_cols_predictions_dir / f"{timestamp_ns}.feather") + failed = False + try: + validate(bad_cols_predictions_dir.parent, mask_file) + except ValueError as e: + print(e) + assert "contain is_dynamic" in str(e) + failed = True + assert failed + + # Wrong dynamic column type + bad_df = pd.DataFrame( + { + "flow_tx_m": np.zeros(npts, dtype=np.float16), + "flow_ty_m": np.zeros(npts, dtype=np.float16), + "flow_tz_m": np.zeros(npts, dtype=np.float16), + "is_dynamic": np.zeros(npts, dtype=np.float16), + } + ) + bad_type_predictions_dir = test_dir / "bad_output_3" / log_id + bad_type_predictions_dir.mkdir(parents=True, exist_ok=True) + bad_df.to_feather(bad_type_predictions_dir / f"{timestamp_ns}.feather") + failed = False + try: + validate(bad_type_predictions_dir.parent, mask_file) + except ValueError as e: + assert "column is_dynamic" in str(e) + failed = True + assert failed + + # Wrong flow column type + bad_df = pd.DataFrame( + { + "flow_tx_m": np.zeros(npts, dtype=np.float16), + "flow_ty_m": np.zeros(npts, dtype=np.float16), + "flow_tz_m": np.zeros(npts, dtype=np.float32), + "is_dynamic": np.zeros(npts, dtype=bool), + } + ) + bad_type_2_predictions_dir = test_dir / "bad_output_4" / log_id + bad_type_2_predictions_dir.mkdir(exist_ok=True, parents=True) + bad_df.to_feather(bad_type_2_predictions_dir / f"{timestamp_ns}.feather") + failed = False + try: + validate(bad_type_2_predictions_dir.parent, mask_file) + except ValueError as e: + assert "column flow_tz_m" in str(e) + failed = True + assert failed + + # extra column + bad_df = pd.DataFrame( + { + "flow_tx_m": np.zeros(npts, dtype=np.float16), + "flow_ty_m": np.zeros(npts, dtype=np.float16), + 
"flow_tz_m": np.zeros(npts, dtype=np.float16), + "is_dynamic": np.zeros(npts, dtype=bool), + "is_static": np.zeros(npts, dtype=bool), + } + ) + extra_col_predictions_dir = test_dir / "bad_output_5" / log_id + extra_col_predictions_dir.mkdir(parents=True, exist_ok=True) + bad_df.to_feather(extra_col_predictions_dir / f"{timestamp_ns}.feather") + failed = False + try: + validate(extra_col_predictions_dir.parent, mask_file) + except ValueError as e: + assert "extra" in str(e) + failed = True + assert failed + + # wrong length + bad_df = pd.DataFrame( + { + "flow_tx_m": np.zeros(npts + 1, dtype=np.float16), + "flow_ty_m": np.zeros(npts + 1, dtype=np.float16), + "flow_tz_m": np.zeros(npts + 1, dtype=np.float16), + "is_dynamic": np.zeros(npts + 1, dtype=bool), + } + ) + wrong_len_predictions_dir = test_dir / "bad_output_6" / log_id + wrong_len_predictions_dir.mkdir(exist_ok=True, parents=True) + bad_df.to_feather(wrong_len_predictions_dir / f"{timestamp_ns}.feather") + failed = False + try: + validate(wrong_len_predictions_dir.parent, mask_file) + except ValueError as e: + assert "rows" in str(e) + failed = True + assert failed