Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
7d9aec1
captioner modules implemented in models/vl, flake.nix fixes
leshy Dec 8, 2025
4186332
model structure rework
leshy Dec 8, 2025
c57faec
refactor
leshy Dec 8, 2025
a099c45
bugfix
leshy Dec 8, 2025
e63f678
removed double update_intrinsic on metric3d
leshy Dec 8, 2025
9d10160
mypy
leshy Dec 8, 2025
ea802c2
typing fixes
leshy Dec 8, 2025
de44479
embedding models rewrite
leshy Dec 9, 2025
e9830a9
mobileclip preprocess accessor rewrite
leshy Dec 9, 2025
3d27328
torch reid models added to lfs, reid/embedding model cleanup
leshy Dec 9, 2025
0ef2af4
mobileclip upload
leshy Dec 9, 2025
a2e4640
batch vlm querying
leshy Dec 9, 2025
6b795f6
moondream batch queries and tests
leshy Dec 9, 2025
4c6b5a1
type fixes
leshy Dec 9, 2025
3777ba5
proper model resource management, speed tests, auto-resizing, plotting
leshy Dec 9, 2025
cc9be60
type fixes
leshy Dec 9, 2025
82b82a1
tests, mypy, correct cleanup
leshy Dec 9, 2025
73670f3
metric3d tests
leshy Dec 9, 2025
7bf48ab
attempting to remove dead code
leshy Dec 9, 2025
5381ff3
scaling bugfix for visual models
leshy Dec 9, 2025
d6b3127
docstring fix
leshy Dec 9, 2025
fc5ac82
plotext dep
leshy Dec 9, 2025
4eb9b7b
open clip dep
leshy Dec 9, 2025
a176996
open clip dep fix
leshy Dec 9, 2025
38344a2
gdown dep
leshy Dec 9, 2025
be24f12
tensorboard dep
leshy Dec 9, 2025
5b79bfb
Merge branch 'dev' into embedding_rewrite
leshy Dec 10, 2025
cc99a4c
typing fixes for detections and plotter
leshy Dec 10, 2025
1223df2
person tracker typing fix
leshy Dec 10, 2025
e960cb9
py 3.10 typing fix
leshy Dec 10, 2025
0f06b1e
last type fix
leshy Dec 10, 2025
d5e2ad3
Merge branch 'dev' into embedding_rewrite
leshy Dec 11, 2025
38f129a
ignore missing imports (for ros deps)
leshy Dec 11, 2025
557e5f5
nicer init for florence
leshy Dec 11, 2025
4462d20
Merge branch 'dev' into embedding_rewrite
leshy Dec 11, 2025
417a1ea
Merge branch 'dev' into embedding_rewrite
leshy Dec 11, 2025
7935a0b
type fixes
leshy Dec 12, 2025
a52ca75
mypy ignore ros/mujoco
leshy Dec 12, 2025
93724ae
addressing PR comments
leshy Dec 16, 2025
06240e5
image is a fixture
leshy Dec 16, 2025
f4fa6c1
captioner fixtures
leshy Dec 16, 2025
2f2cb8b
all PR comments addressed
leshy Dec 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions data/.lfs/models_mobileclip.tar.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions data/.lfs/models_torchreid.tar.gz
Git LFS file not shown
9 changes: 7 additions & 2 deletions dimos/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,14 @@ def monitor_threads(request):
t for t in threading.enumerate() if t.ident in new_thread_ids and t.name != "MainThread"
]

# Filter out expected persistent threads from Dask that are shared globally
# Filter out expected persistent threads that are shared globally
# These threads are intentionally left running and cleaned up on process exit
expected_persistent_thread_prefixes = ["Dask-Offload"]
expected_persistent_thread_prefixes = [
"Dask-Offload",
# HuggingFace safetensors conversion thread - no user cleanup API
# https://github.com/huggingface/transformers/issues/29513
"Thread-auto_conversion",
]
new_threads = [
t
for t in new_threads
Expand Down
3 changes: 3 additions & 0 deletions dimos/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Public package interface for ``dimos.models``: re-exports the base model classes."""

from dimos.models.base import HuggingFaceModel, LocalModel

__all__ = ["LocalModel", "HuggingFaceModel"]
199 changes: 199 additions & 0 deletions dimos/models/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Base classes for local GPU models."""

from __future__ import annotations

from dataclasses import dataclass
from functools import cached_property
from typing import Annotated, Any

import torch

from dimos.core.resource import Resource
from dimos.protocol.service import Configurable # type: ignore[attr-defined]

# Device string type - 'cuda', 'cpu', 'cuda:0', 'cuda:1', etc.
DeviceType = Annotated[str, "Device identifier (e.g., 'cuda', 'cpu', 'cuda:0')"]


@dataclass
class LocalModelConfig:
    # Target device string; default resolved once at import time from CUDA availability.
    device: DeviceType = "cuda" if torch.cuda.is_available() else "cpu"
    # Parameter/compute dtype for the model.
    dtype: torch.dtype = torch.float32
    # Either flag being True makes LocalModel.__init__ call start() eagerly;
    # otherwise the model loads lazily on first access to _model.
    warmup: bool = False
    autostart: bool = False


class LocalModel(Resource, Configurable[LocalModelConfig]):
    """Base class for all local GPU/CPU models.

    Implements the Resource interface for lifecycle management.

    Subclasses MUST override:
    - _model: @cached_property that loads and returns the model

    Subclasses MAY override:
    - start() for custom initialization logic
    - stop() for custom cleanup logic
    """

    default_config = LocalModelConfig
    config: LocalModelConfig

    def __init__(self, **kwargs: object) -> None:
        """Initialize the model from keyword configuration.

        Args:
            **kwargs: Configuration overrides forwarded to the
                Resource/Configurable machinery; see LocalModelConfig
                (``device``, ``dtype``, ``warmup``, ``autostart``).

        If either ``warmup`` or ``autostart`` is set in the resulting config,
        the model is loaded eagerly here; otherwise it loads lazily on first
        access to ``_model``.
        """
        super().__init__(**kwargs)
        # Eager load when requested; otherwise defer to first _model access.
        if self.config.warmup or self.config.autostart:
            self.start()

    @property
    def device(self) -> str:
        """The device this model runs on (e.g. 'cuda', 'cpu', 'cuda:0')."""
        return self.config.device

    @property
    def dtype(self) -> torch.dtype:
        """The dtype used by this model."""
        return self.config.dtype

    @cached_property
    def _model(self) -> Any:
        """Lazily loaded model. Subclasses must override this property."""
        raise NotImplementedError(f"{self.__class__.__name__} must override _model property")

    def start(self) -> None:
        """Load the model (Resource interface).

        Subclasses should override to add custom initialization.
        """
        # Touching the cached_property triggers (and caches) the load.
        _ = self._model

    def stop(self) -> None:
        """Release model and free GPU memory (Resource interface).

        Subclasses should override and call super().stop() for custom cleanup.
        """
        import gc

        # Evict the cached_property value so a later start() reloads cleanly.
        if "_model" in self.__dict__:
            del self.__dict__["_model"]

        # Reset torch.compile caches to free memory from compiled models
        # See: https://github.com/pytorch/pytorch/issues/105181
        try:
            import torch._dynamo

            torch._dynamo.reset()
        except (ImportError, AttributeError):
            pass

        gc.collect()
        if self.config.device.startswith("cuda") and torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _ensure_cuda_initialized(self) -> None:
        """Initialize CUDA context to prevent cuBLAS allocation failures.

        Some models (CLIP, TorchReID) fail if they are the first to use CUDA.
        Call this before model loading if needed.
        """
        if self.config.device.startswith("cuda") and torch.cuda.is_available():
            try:
                # A tiny matmul forces cuBLAS handle creation up front.
                _ = torch.zeros(1, 1, device="cuda") @ torch.zeros(1, 1, device="cuda")
                torch.cuda.synchronize()
            except Exception:
                # Deliberate best-effort warmup; real failures surface at model load.
                pass


@dataclass
class HuggingFaceModelConfig(LocalModelConfig):
    # HuggingFace Hub model identifier passed to from_pretrained().
    model_name: str = ""
    # Forwarded to from_pretrained(); permits repos that ship custom model code.
    trust_remote_code: bool = True
    # Overrides LocalModelConfig's float32 default: HF models load in fp16 here.
    dtype: torch.dtype = torch.float16


class HuggingFaceModel(LocalModel):
    """Base class for HuggingFace transformers-based models.

    Provides common patterns for loading models from the HuggingFace Hub
    using from_pretrained().

    Subclasses SHOULD set:
    - _model_class: The AutoModel class to use (e.g., AutoModelForCausalLM)

    Subclasses MAY override:
    - _model: @cached_property for custom model loading
    """

    default_config = HuggingFaceModelConfig
    config: HuggingFaceModelConfig
    _model_class: Any = None  # e.g., AutoModelForCausalLM

    @property
    def model_name(self) -> str:
        """The HuggingFace model identifier."""
        return self.config.model_name

    @cached_property
    def _model(self) -> Any:
        """Load the HuggingFace model using _model_class.

        Override this property for custom loading logic.

        Raises:
            NotImplementedError: If the subclass neither set _model_class
                nor overrode this property.
        """
        if self._model_class is None:
            raise NotImplementedError(
                f"{self.__class__.__name__} must set _model_class or override _model property"
            )
        model = self._model_class.from_pretrained(
            self.config.model_name,
            trust_remote_code=self.config.trust_remote_code,
            torch_dtype=self.config.dtype,
        )
        return model.to(self.config.device)

    def _move_inputs_to_device(
        self,
        inputs: dict[str, Any],
        apply_dtype: bool = True,
    ) -> dict[str, Any]:
        """Move input tensors to the model device with appropriate dtype.

        Non-tensor values are passed through unchanged, so the input dict may
        mix tensors with other objects (the previous ``dict[str, torch.Tensor]``
        annotation contradicted the explicit non-tensor branch below).

        Args:
            inputs: Dictionary of input values; only tensors are relocated.
            apply_dtype: Whether to cast floating-point tensors to the model
                dtype. Integer tensors (e.g. input_ids) keep their dtype.

        Returns:
            Dictionary with tensors moved to the device; other values untouched.
        """
        result: dict[str, Any] = {}
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                if apply_dtype and v.is_floating_point():
                    result[k] = v.to(self.config.device, dtype=self.config.dtype)
                else:
                    result[k] = v.to(self.config.device)
            else:
                result[k] = v
        return result
75 changes: 45 additions & 30 deletions dimos/models/depth/metric3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,32 +12,50 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass, field
from functools import cached_property
from typing import Any

import cv2
import torch

# May need to add this back for import to work
# external_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'external', 'Metric3D'))
# if external_path not in sys.path:
# sys.path.append(external_path)
from dimos.models.base import LocalModel, LocalModelConfig


@dataclass
class Metric3DConfig(LocalModelConfig):
    """Configuration for Metric3D depth estimation model."""

    # Camera intrinsics [fx, fy, cx, cy]; default_factory avoids sharing one
    # mutable list across instances.
    camera_intrinsics: list[float] = field(default_factory=lambda: [500.0, 500.0, 320.0, 240.0])

    # Scale factor for ground truth depth.
    gt_depth_scale: float = 256.0

    # Device to run the model on.
    # NOTE(review): LocalModelConfig already declares this field with the same
    # default; the override is redundant but kept for backward compatibility.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"


class Metric3D:
def __init__(self, camera_intrinsics=None, gt_depth_scale: float=256.0) -> None: # type: ignore[no-untyped-def]
# self.conf = get_config("zoedepth", "infer")
# self.depth_model = build_model(self.conf)
self.depth_model = torch.hub.load( # type: ignore[no-untyped-call]
class Metric3D(LocalModel):
default_config = Metric3DConfig
config: Metric3DConfig

def __init__(self, **kwargs: object) -> None:
super().__init__(**kwargs)
self.intrinsic = self.config.camera_intrinsics
self.intrinsic_scaled: list[float] | None = None
self.gt_depth_scale = self.config.gt_depth_scale
self.pad_info: list[int] | None = None
self.rgb_origin: Any = None

@cached_property
def _model(self) -> Any:
model = torch.hub.load( # type: ignore[no-untyped-call]
"yvanyin/metric3d", "metric3d_vit_small", pretrain=True
).cuda()
if torch.cuda.device_count() > 1:
print(f"Using {torch.cuda.device_count()} GPUs!")
# self.depth_model = torch.nn.DataParallel(self.depth_model)
self.depth_model.eval()

self.intrinsic = camera_intrinsics
self.intrinsic_scaled = None
self.gt_depth_scale = gt_depth_scale # And this
self.pad_info = None
self.rgb_origin = None
)
model = model.to(self.device)
model.eval()
return model

"""
Input: Single image in RGB format
Expand All @@ -54,7 +72,7 @@ def update_intrinsic(self, intrinsic): # type: ignore[no-untyped-def]
self.intrinsic = intrinsic
print(f"Intrinsics updated to: {self.intrinsic}")

def infer_depth(self, img, debug: bool=False): # type: ignore[no-untyped-def]
def infer_depth(self, img, debug: bool = False): # type: ignore[no-untyped-def]
if debug:
print(f"Input image: {img}")
try:
Expand All @@ -70,7 +88,7 @@ def infer_depth(self, img, debug: bool=False): # type: ignore[no-untyped-def]
img = self.rescale_input(img, self.rgb_origin) # type: ignore[no-untyped-call]

with torch.no_grad():
pred_depth, confidence, output_dict = self.depth_model.inference({"input": img})
pred_depth, confidence, output_dict = self._model.inference({"input": img})

# Convert to PIL format
depth_image = self.unpad_transform_depth(pred_depth) # type: ignore[no-untyped-call]
Expand Down Expand Up @@ -125,7 +143,7 @@ def rescale_input(self, rgb, rgb_origin): # type: ignore[no-untyped-def]
std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
rgb = torch.div((rgb - mean), std)
rgb = rgb[None, :, :, :].cuda()
rgb = rgb[None, :, :, :].to(self.device)
return rgb

def unpad_transform_depth(self, pred_depth): # type: ignore[no-untyped-def]
Expand All @@ -138,7 +156,9 @@ def unpad_transform_depth(self, pred_depth): # type: ignore[no-untyped-def]

# upsample to original size
pred_depth = torch.nn.functional.interpolate(
pred_depth[None, None, :, :], self.rgb_origin.shape[:2], mode="bilinear" # type: ignore[attr-defined]
pred_depth[None, None, :, :],
self.rgb_origin.shape[:2],
mode="bilinear", # type: ignore[attr-defined]
).squeeze()
###################### canonical camera space ######################

Expand All @@ -150,16 +170,11 @@ def unpad_transform_depth(self, pred_depth): # type: ignore[no-untyped-def]
pred_depth = torch.clamp(pred_depth, 0, 1000)
return pred_depth

"""Set new intrinsic value."""

def update_intrinsic(self, intrinsic) -> None: # type: ignore[no-redef, no-untyped-def]
self.intrinsic = intrinsic

def eval_predicted_depth(self, depth_file, pred_depth) -> None: # type: ignore[no-untyped-def]
if depth_file is not None:
gt_depth = cv2.imread(depth_file, -1)
gt_depth = gt_depth / self.gt_depth_scale
gt_depth = torch.from_numpy(gt_depth).float().cuda() # type: ignore[assignment]
gt_depth = torch.from_numpy(gt_depth).float().to(self.device) # type: ignore[assignment]
assert gt_depth.shape == pred_depth.shape

mask = gt_depth > 1e-8
Expand Down
Loading