Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 46 additions & 8 deletions sdks/python/src/opik/runner/bridge_handlers/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import subprocess
import threading
from pathlib import Path
from typing import Optional, Set, Tuple
from typing import Set, Tuple

from . import CommandError

Expand Down Expand Up @@ -90,9 +90,39 @@ def resolve_text_file(path_str: str, repo_root: Path) -> Tuple[Path, str]:
return path, raw


def git_ls_files(repo_root: Path) -> Optional[Set[str]]:
"""Return all git-visible files (tracked + untracked non-ignored) as relative
paths. Returns None if git is unavailable or the directory isn't a repo."""
def check_git_repo(repo_root: Path) -> None:
"""Verify repo_root is inside a git repository.

Raises CommandError with code 'not_a_git_repository' or 'git_not_available'.
"""
try:
result = subprocess.run(
["git", "rev-parse", "--git-dir"],
cwd=str(repo_root),
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=5.0,
Comment thread
petrotiurin marked this conversation as resolved.
)
if result.returncode != 0:
raise CommandError(
"not_a_git_repository",
f"Operation requires a git repository, but {repo_root} is not one",
)
except subprocess.TimeoutExpired:
raise CommandError("git_not_available", "git command timed out")
except FileNotFoundError:
raise CommandError(
"git_not_available",
"You must have git installed to use Opik Connect",
Comment thread
petrotiurin marked this conversation as resolved.
)


def git_ls_files(repo_root: Path) -> Set[str]:
"""Return all git-visible files (tracked + untracked non-ignored) as relative paths.

Raises CommandError with code 'git_not_available' on timeout/missing git,
or 'search_failed' on unexpected git errors.
"""
try:
tracked = subprocess.run(
["git", "ls-files"],
Expand All @@ -102,7 +132,10 @@ def git_ls_files(repo_root: Path) -> Optional[Set[str]]:
timeout=10,
)
if tracked.returncode != 0:
return None
raise CommandError(
"search_failed",
f"git ls-files failed: {tracked.stderr.strip()}",
)

untracked = subprocess.run(
["git", "ls-files", "--others", "--exclude-standard"],
Comment thread
petrotiurin marked this conversation as resolved.
Expand All @@ -112,16 +145,21 @@ def git_ls_files(repo_root: Path) -> Optional[Set[str]]:
timeout=10,
)

files = set()
files: Set[str] = set()
for line in tracked.stdout.splitlines():
if line.strip():
files.add(line.strip())
for line in untracked.stdout.splitlines():
if line.strip():
files.add(line.strip())
return files
except (subprocess.TimeoutExpired, FileNotFoundError):
return None
except subprocess.TimeoutExpired:
raise CommandError("git_not_available", "git command timed out")
except FileNotFoundError:
raise CommandError(
"git_not_available",
"You must have git installed to use Opik Connect",
)


def backoff_wait(
Expand Down
39 changes: 3 additions & 36 deletions sdks/python/src/opik/runner/bridge_handlers/list_files.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""list_files bridge command handler."""

import os
from pathlib import Path, PurePosixPath
from typing import Any, Dict, List, Set, Tuple
from typing import Any, Dict, List

from pydantic import BaseModel

Expand Down Expand Up @@ -39,10 +38,8 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]:
if not base.is_dir():
raise CommandError("file_not_found", f"Directory not found: {sub_path}")

walk_truncated = False
common.check_git_repo(self._repo_root)
all_files = common.git_ls_files(self._repo_root)
if all_files is None:
all_files, walk_truncated = _walk_files(self._repo_root)

try:
base_rel = str(base.relative_to(self._repo_root))
Expand All @@ -62,7 +59,7 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]:
matches: List[str] = []
total = len(filtered)
byte_count = 0
truncated = walk_truncated
truncated = False
Comment thread
collincunn marked this conversation as resolved.

for rel in filtered:
entry_bytes = len(rel.encode("utf-8")) + 1
Expand All @@ -88,36 +85,6 @@ def _matches_pattern(rel: str, pattern: str) -> bool:
return False


_WALK_MAX_FILES = 10_000


def _walk_files(repo_root: Path) -> Tuple[Set[str], bool]:
files: Set[str] = set()
for dirpath, dirnames, filenames in os.walk(repo_root):
dirnames[:] = [
d
for d in dirnames
if not d.startswith(".") and d not in common.WALK_SKIP_DIRS
]
for fname in filenames:
if fname.startswith("."):
continue
full = Path(dirpath) / fname
if full.is_symlink():
try:
full.resolve().relative_to(repo_root.resolve())
except ValueError:
continue
try:
rel = str(full.relative_to(repo_root))
files.add(rel)
except ValueError:
continue
if len(files) >= _WALK_MAX_FILES:
return files, True
return files, False


def _safe_mtime(path: Path) -> float:
try:
return path.stat().st_mtime
Expand Down
10 changes: 7 additions & 3 deletions sdks/python/src/opik/runner/bridge_handlers/search_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]:
f"Pattern too long (max {_MAX_PATTERN_LENGTH} chars)",
)

common.check_git_repo(self._repo_root)

glob_filter = parsed.glob
sub_path = parsed.path

Expand All @@ -50,13 +52,16 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]:
cmd = [
"git",
"grep",
"--untracked",
"-n",
f"-C{_CONTEXT_LINES}",
"--no-color",
"-P",
parsed.pattern,
]

# git grep doesn't support combining glob patterns and path arguments.
# If both are provided, glob takes precedence.
if glob_filter:
cmd.extend(["--", glob_filter])
elif sub_path:
Expand All @@ -72,13 +77,12 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]:
)
except subprocess.TimeoutExpired:
raise CommandError("timeout", "Search timed out")
except FileNotFoundError:
raise CommandError("internal", "git not available")

if result.returncode not in (0, 1):
stderr = result.stderr.strip()
if "invalid" in stderr.lower() or "error" in stderr.lower():
if "invalid" in stderr.lower():
raise CommandError("match_not_found", f"Invalid pattern: {stderr}")
raise CommandError("search_failed", f"git grep failed: {stderr}")

matches: List[Dict[str, Any]] = []
total_matches = 0
Expand Down
8 changes: 6 additions & 2 deletions sdks/python/src/opik/runner/snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

from .bridge_handlers import common
from .bridge_handlers import CommandError, common

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -42,7 +42,11 @@


def build_checklist(repo_root: Path, command: Optional[List[str]]) -> Dict[str, Any]:
git_files = common.git_ls_files(repo_root)
try:
git_files: Optional[Set[str]] = common.git_ls_files(repo_root)
except CommandError:
# Not a git repo or git unavailable — fall back to os.walk
git_files = None
Comment thread
petrotiurin marked this conversation as resolved.
file_tree = _build_file_tree(repo_root, git_files)
matches = _find_instrumentation(repo_root, git_files)

Expand Down
83 changes: 16 additions & 67 deletions sdks/python/tests/unit/runner/bridge_handlers/test_list_files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import subprocess
import time
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

Expand Down Expand Up @@ -110,74 +111,22 @@ def test_list_files__no_matches__returns_empty(self, tmp_path: Path) -> None:
assert result["total"] == 0
assert result["truncated"] is False


class TestListFilesNonGit:
"""Tests for ListFiles fallback when no git repo is present."""

def _handler(self, tmp_path: Path) -> ListFilesHandler:
return ListFilesHandler(tmp_path)

def test_list_files__no_git__finds_files(self, tmp_path: Path) -> None:
(tmp_path / "a.py").write_text("x")
(tmp_path / "sub").mkdir()
(tmp_path / "sub" / "b.py").write_text("x")
handler = self._handler(tmp_path)
result = handler.execute({"pattern": "**/*.py"}, timeout=30.0)
assert "a.py" in result["files"]
assert any("b.py" in f for f in result["files"])

def test_list_files__no_git__skips_hidden_dirs(self, tmp_path: Path) -> None:
(tmp_path / ".hidden").mkdir()
(tmp_path / ".hidden" / "secret.py").write_text("x")
(tmp_path / "visible.py").write_text("x")
handler = self._handler(tmp_path)
result = handler.execute({"pattern": "**/*.py"}, timeout=30.0)
assert "visible.py" in result["files"]
assert not any("secret.py" in f for f in result["files"])

def test_list_files__no_git__skips_hidden_files(self, tmp_path: Path) -> None:
(tmp_path / ".env").write_text("SECRET=x")
(tmp_path / "app.py").write_text("x")
handler = self._handler(tmp_path)
result = handler.execute({}, timeout=30.0)
assert not any(".env" in f for f in result["files"])
assert "app.py" in result["files"]

def test_list_files__no_git__skips_junk_dirs(self, tmp_path: Path) -> None:
(tmp_path / "node_modules").mkdir()
(tmp_path / "node_modules" / "pkg.js").write_text("x")
(tmp_path / "__pycache__").mkdir()
(tmp_path / "__pycache__" / "mod.pyc").write_text("x")
(tmp_path / "app.py").write_text("x")
@patch("subprocess.run")
def test_list_files__not_a_git_repo__raises_error(
self, mock_run: MagicMock, tmp_path: Path
) -> None:
mock_run.return_value = MagicMock(returncode=1)
handler = self._handler(tmp_path)
result = handler.execute({}, timeout=30.0)
assert "app.py" in result["files"]
assert not any("node_modules" in f for f in result["files"])
assert not any("__pycache__" in f for f in result["files"])
with pytest.raises(CommandError) as exc_info:
handler.execute({"pattern": "*.py"}, timeout=30.0)
assert exc_info.value.code == "not_a_git_repository"

def test_list_files__no_git__symlink_outside_root_excluded(
self, tmp_path: Path
@patch("subprocess.run")
def test_list_files__git_not_available__raises_error(
self, mock_run: MagicMock, tmp_path: Path
) -> None:
outside = tmp_path / "outside"
outside.mkdir()
secret = outside / "secret.txt"
secret.write_text("sensitive")

repo = tmp_path / "repo"
repo.mkdir()
(repo / "legit.py").write_text("x")
(repo / "link.txt").symlink_to(secret)

handler = ListFilesHandler(repo)
result = handler.execute({}, timeout=30.0)
assert "legit.py" in result["files"]
assert not any("link.txt" in f for f in result["files"])

def test_list_files__no_git__caps_at_max_files(self, tmp_path: Path) -> None:
from opik.runner.bridge_handlers.list_files import _WALK_MAX_FILES

for i in range(_WALK_MAX_FILES + 100):
(tmp_path / f"file_{i:06d}.txt").write_text("x")
mock_run.side_effect = FileNotFoundError("git not found")
handler = self._handler(tmp_path)
result = handler.execute({}, timeout=30.0)
assert result["total"] <= _WALK_MAX_FILES
with pytest.raises(CommandError) as exc_info:
handler.execute({"pattern": "*.py"}, timeout=30.0)
assert exc_info.value.code == "git_not_available"
Loading
Loading