Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
7d9aec1
captioner modules implemented in models/vl, flake.nix fixes
leshy Dec 8, 2025
4186332
model structure rework
leshy Dec 8, 2025
c57faec
refactor
leshy Dec 8, 2025
a099c45
bugfix
leshy Dec 8, 2025
e63f678
removed double update_intrinsic on metric3d
leshy Dec 8, 2025
9d10160
mypy
leshy Dec 8, 2025
ea802c2
typing fixes
leshy Dec 8, 2025
de44479
embedding models rewrite
leshy Dec 9, 2025
e9830a9
mobileclip preprocess accessor rewrite
leshy Dec 9, 2025
3d27328
torch reid models added to lfs, reid/embedding model cleanup
leshy Dec 9, 2025
0ef2af4
mobileclip upload
leshy Dec 9, 2025
a2e4640
batch vlm querying
leshy Dec 9, 2025
6b795f6
moondream batch queries and tests
leshy Dec 9, 2025
4c6b5a1
type fixes
leshy Dec 9, 2025
3777ba5
proper model resource management, speed tests, auto-resizing, plotting
leshy Dec 9, 2025
cc9be60
type fixes
leshy Dec 9, 2025
82b82a1
tests, mypy, correct cleanup
leshy Dec 9, 2025
73670f3
metric3d tests
leshy Dec 9, 2025
7bf48ab
attempting to remove dead code
leshy Dec 9, 2025
5381ff3
scaling bugfix for visual models
leshy Dec 9, 2025
d6b3127
docstring fix
leshy Dec 9, 2025
fc5ac82
plotext dep
leshy Dec 9, 2025
4eb9b7b
open clip dep
leshy Dec 9, 2025
a176996
open clip dep fix
leshy Dec 9, 2025
38344a2
gdown dep
leshy Dec 9, 2025
be24f12
tensorboard dep
leshy Dec 9, 2025
5b79bfb
Merge branch 'dev' into embedding_rewrite
leshy Dec 10, 2025
cc99a4c
typing fixes for detections and plotter
leshy Dec 10, 2025
1223df2
person tracker typing fix
leshy Dec 10, 2025
e960cb9
py 3.10 typing fix
leshy Dec 10, 2025
0f06b1e
last type fix
leshy Dec 10, 2025
d5e2ad3
Merge branch 'dev' into embedding_rewrite
leshy Dec 11, 2025
38f129a
ignore missing imports (for ros deps)
leshy Dec 11, 2025
557e5f5
nicer init for florence
leshy Dec 11, 2025
4462d20
Merge branch 'dev' into embedding_rewrite
leshy Dec 11, 2025
417a1ea
Merge branch 'dev' into embedding_rewrite
leshy Dec 11, 2025
7935a0b
type fixes
leshy Dec 12, 2025
a52ca75
mypy ignore ros/mujoco
leshy Dec 12, 2025
93724ae
addressing PR comments
leshy Dec 16, 2025
06240e5
image is a fixture
leshy Dec 16, 2025
f4fa6c1
captioner fixtures
leshy Dec 16, 2025
2f2cb8b
all PR comments addressed
leshy Dec 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions data/.lfs/models_mobileclip.tar.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions data/.lfs/models_torchreid.tar.gz
Git LFS file not shown
9 changes: 7 additions & 2 deletions dimos/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,14 @@ def monitor_threads(request):
t for t in threading.enumerate() if t.ident in new_thread_ids and t.name != "MainThread"
]

# Filter out expected persistent threads from Dask that are shared globally
# Filter out expected persistent threads that are shared globally
# These threads are intentionally left running and cleaned up on process exit
expected_persistent_thread_prefixes = ["Dask-Offload"]
expected_persistent_thread_prefixes = [
"Dask-Offload",
# HuggingFace safetensors conversion thread - no user cleanup API
# https://github.com/huggingface/transformers/issues/29513
"Thread-auto_conversion",
]
new_threads = [
t
for t in new_threads
Expand Down
3 changes: 3 additions & 0 deletions dimos/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Public package interface for ``dimos.models``: re-exports the base model classes."""

from dimos.models.base import HuggingFaceModel, LocalModel

__all__ = ["LocalModel", "HuggingFaceModel"]
199 changes: 199 additions & 0 deletions dimos/models/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Base classes for local GPU models."""

from __future__ import annotations

from dataclasses import dataclass
from functools import cached_property
from typing import Annotated, Any

import torch

from dimos.core.resource import Resource
from dimos.protocol.service import Configurable # type: ignore[attr-defined]

# Device string type - 'cuda', 'cpu', 'cuda:0', 'cuda:1', etc.
DeviceType = Annotated[str, "Device identifier (e.g., 'cuda', 'cpu', 'cuda:0')"]


@dataclass
class LocalModelConfig:
    # Target device string; default resolved once at import time from CUDA availability.
    device: DeviceType = "cuda" if torch.cuda.is_available() else "cpu"
    # Parameter/compute dtype for the model.
    dtype: torch.dtype = torch.float32
    # Either flag being True makes LocalModel.__init__ call start() eagerly;
    # otherwise the model loads lazily on first access to _model.
    warmup: bool = False
    autostart: bool = False


class LocalModel(Resource, Configurable[LocalModelConfig]):
    """Base class for all local GPU/CPU models.

    Implements the Resource interface for lifecycle management.

    Subclasses MUST override:
    - _model: @cached_property that loads and returns the model

    Subclasses MAY override:
    - start() for custom initialization logic
    - stop() for custom cleanup logic
    """

    default_config = LocalModelConfig
    config: LocalModelConfig

    def __init__(self, **kwargs: object) -> None:
        """Initialize the model from keyword configuration.

        Args:
            **kwargs: Configuration overrides forwarded to the
                Resource/Configurable machinery; see LocalModelConfig
                (``device``, ``dtype``, ``warmup``, ``autostart``).

        If either ``warmup`` or ``autostart`` is set in the resulting config,
        the model is loaded eagerly here; otherwise it loads lazily on first
        access to ``_model``.
        """
        super().__init__(**kwargs)
        # Eager load when requested; otherwise defer to first _model access.
        if self.config.warmup or self.config.autostart:
            self.start()

    @property
    def device(self) -> str:
        """The device this model runs on (e.g. 'cuda', 'cpu', 'cuda:0')."""
        return self.config.device

    @property
    def dtype(self) -> torch.dtype:
        """The dtype used by this model."""
        return self.config.dtype

    @cached_property
    def _model(self) -> Any:
        """Lazily loaded model. Subclasses must override this property."""
        raise NotImplementedError(f"{self.__class__.__name__} must override _model property")

    def start(self) -> None:
        """Load the model (Resource interface).

        Subclasses should override to add custom initialization.
        """
        # Touching the cached_property triggers (and caches) the load.
        _ = self._model

    def stop(self) -> None:
        """Release model and free GPU memory (Resource interface).

        Subclasses should override and call super().stop() for custom cleanup.
        """
        import gc

        # Evict the cached_property value so a later start() reloads cleanly.
        if "_model" in self.__dict__:
            del self.__dict__["_model"]

        # Reset torch.compile caches to free memory from compiled models
        # See: https://github.com/pytorch/pytorch/issues/105181
        try:
            import torch._dynamo

            torch._dynamo.reset()
        except (ImportError, AttributeError):
            pass

        gc.collect()
        if self.config.device.startswith("cuda") and torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _ensure_cuda_initialized(self) -> None:
        """Initialize CUDA context to prevent cuBLAS allocation failures.

        Some models (CLIP, TorchReID) fail if they are the first to use CUDA.
        Call this before model loading if needed.
        """
        if self.config.device.startswith("cuda") and torch.cuda.is_available():
            try:
                # A tiny matmul forces cuBLAS handle creation up front.
                _ = torch.zeros(1, 1, device="cuda") @ torch.zeros(1, 1, device="cuda")
                torch.cuda.synchronize()
            except Exception:
                # Deliberate best-effort warmup; real failures surface at model load.
                pass


@dataclass
class HuggingFaceModelConfig(LocalModelConfig):
    # HuggingFace Hub model identifier passed to from_pretrained().
    model_name: str = ""
    # Forwarded to from_pretrained(); permits repos that ship custom model code.
    trust_remote_code: bool = True
    # Overrides LocalModelConfig's float32 default: HF models load in fp16 here.
    dtype: torch.dtype = torch.float16


class HuggingFaceModel(LocalModel):
    """Base class for HuggingFace transformers-based models.

    Provides common patterns for loading models from the HuggingFace Hub
    using from_pretrained().

    Subclasses SHOULD set:
    - _model_class: The AutoModel class to use (e.g., AutoModelForCausalLM)

    Subclasses MAY override:
    - _model: @cached_property for custom model loading
    """

    default_config = HuggingFaceModelConfig
    config: HuggingFaceModelConfig
    _model_class: Any = None  # e.g., AutoModelForCausalLM

    @property
    def model_name(self) -> str:
        """The HuggingFace model identifier."""
        return self.config.model_name

    @cached_property
    def _model(self) -> Any:
        """Load the HuggingFace model using _model_class.

        Override this property for custom loading logic.

        Raises:
            NotImplementedError: If the subclass neither set _model_class
                nor overrode this property.
        """
        if self._model_class is None:
            raise NotImplementedError(
                f"{self.__class__.__name__} must set _model_class or override _model property"
            )
        model = self._model_class.from_pretrained(
            self.config.model_name,
            trust_remote_code=self.config.trust_remote_code,
            torch_dtype=self.config.dtype,
        )
        return model.to(self.config.device)

    def _move_inputs_to_device(
        self,
        inputs: dict[str, Any],
        apply_dtype: bool = True,
    ) -> dict[str, Any]:
        """Move input tensors to the model device with appropriate dtype.

        Non-tensor values are passed through unchanged, so the input dict may
        mix tensors with other objects (the previous ``dict[str, torch.Tensor]``
        annotation contradicted the explicit non-tensor branch below).

        Args:
            inputs: Dictionary of input values; only tensors are relocated.
            apply_dtype: Whether to cast floating-point tensors to the model
                dtype. Integer tensors (e.g. input_ids) keep their dtype.

        Returns:
            Dictionary with tensors moved to the device; other values untouched.
        """
        result: dict[str, Any] = {}
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                if apply_dtype and v.is_floating_point():
                    result[k] = v.to(self.config.device, dtype=self.config.dtype)
                else:
                    result[k] = v.to(self.config.device)
            else:
                result[k] = v
        return result
75 changes: 45 additions & 30 deletions dimos/models/depth/metric3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,32 +12,50 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from functools import cached_property
from typing import Any

import cv2
import torch

# May need to add this back for import to work
# external_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'external', 'Metric3D'))
# if external_path not in sys.path:
# sys.path.append(external_path)
from dimos.models.base import LocalModel, LocalModelConfig


@dataclass
class Metric3DConfig(LocalModelConfig):
    """Configuration for Metric3D depth estimation model."""

    # Camera intrinsics [fx, fy, cx, cy]; default_factory avoids sharing one
    # mutable list across instances.
    camera_intrinsics: list[float] = field(default_factory=lambda: [500.0, 500.0, 320.0, 240.0])

    # Scale factor for ground truth depth.
    gt_depth_scale: float = 256.0

    # Device to run the model on.
    # NOTE(review): LocalModelConfig already declares this field with the same
    # default; the override is redundant but kept for backward compatibility.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"


class Metric3D:
def __init__(self, camera_intrinsics=None, gt_depth_scale: float=256.0) -> None: # type: ignore[no-untyped-def]
# self.conf = get_config("zoedepth", "infer")
# self.depth_model = build_model(self.conf)
self.depth_model = torch.hub.load( # type: ignore[no-untyped-call]
class Metric3D(LocalModel):
default_config = Metric3DConfig
config: Metric3DConfig

def __init__(self, **kwargs: object) -> None:
super().__init__(**kwargs)
self.intrinsic = self.config.camera_intrinsics
self.intrinsic_scaled: list[float] | None = None
self.gt_depth_scale = self.config.gt_depth_scale
self.pad_info: list[int] | None = None
self.rgb_origin: Any = None

@cached_property
def _model(self) -> Any:
model = torch.hub.load( # type: ignore[no-untyped-call]
"yvanyin/metric3d", "metric3d_vit_small", pretrain=True
).cuda()
if torch.cuda.device_count() > 1:
print(f"Using {torch.cuda.device_count()} GPUs!")
# self.depth_model = torch.nn.DataParallel(self.depth_model)
self.depth_model.eval()

self.intrinsic = camera_intrinsics
self.intrinsic_scaled = None
self.gt_depth_scale = gt_depth_scale # And this
self.pad_info = None
self.rgb_origin = None
)
model = model.to(self.device)
model.eval()
return model

"""
Input: Single image in RGB format
Expand All @@ -54,7 +72,7 @@ def update_intrinsic(self, intrinsic): # type: ignore[no-untyped-def]
self.intrinsic = intrinsic
print(f"Intrinsics updated to: {self.intrinsic}")

def infer_depth(self, img, debug: bool=False): # type: ignore[no-untyped-def]
def infer_depth(self, img, debug: bool = False): # type: ignore[no-untyped-def]
if debug:
print(f"Input image: {img}")
try:
Expand All @@ -70,7 +88,7 @@ def infer_depth(self, img, debug: bool=False): # type: ignore[no-untyped-def]
img = self.rescale_input(img, self.rgb_origin) # type: ignore[no-untyped-call]

with torch.no_grad():
pred_depth, confidence, output_dict = self.depth_model.inference({"input": img})
pred_depth, confidence, output_dict = self._model.inference({"input": img})

# Convert to PIL format
depth_image = self.unpad_transform_depth(pred_depth) # type: ignore[no-untyped-call]
Expand Down Expand Up @@ -125,7 +143,7 @@ def rescale_input(self, rgb, rgb_origin): # type: ignore[no-untyped-def]
std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
rgb = torch.div((rgb - mean), std)
rgb = rgb[None, :, :, :].cuda()
rgb = rgb[None, :, :, :].to(self.device)
return rgb

def unpad_transform_depth(self, pred_depth): # type: ignore[no-untyped-def]
Expand All @@ -138,7 +156,9 @@ def unpad_transform_depth(self, pred_depth): # type: ignore[no-untyped-def]

# upsample to original size
pred_depth = torch.nn.functional.interpolate(
pred_depth[None, None, :, :], self.rgb_origin.shape[:2], mode="bilinear" # type: ignore[attr-defined]
pred_depth[None, None, :, :],
self.rgb_origin.shape[:2],
mode="bilinear", # type: ignore[attr-defined]
).squeeze()
###################### canonical camera space ######################

Expand All @@ -150,16 +170,11 @@ def unpad_transform_depth(self, pred_depth): # type: ignore[no-untyped-def]
pred_depth = torch.clamp(pred_depth, 0, 1000)
return pred_depth

"""Set new intrinsic value."""

def update_intrinsic(self, intrinsic) -> None: # type: ignore[no-redef, no-untyped-def]
self.intrinsic = intrinsic

def eval_predicted_depth(self, depth_file, pred_depth) -> None: # type: ignore[no-untyped-def]
if depth_file is not None:
gt_depth = cv2.imread(depth_file, -1)
gt_depth = gt_depth / self.gt_depth_scale
gt_depth = torch.from_numpy(gt_depth).float().cuda() # type: ignore[assignment]
gt_depth = torch.from_numpy(gt_depth).float().to(self.device) # type: ignore[assignment]
assert gt_depth.shape == pred_depth.shape

mask = gt_depth > 1e-8
Expand Down
Loading