diff --git a/sdks/python/src/opik/runner/bridge_handlers/common.py b/sdks/python/src/opik/runner/bridge_handlers/common.py index 9c5fdb2deb0..29f8ed70d82 100644 --- a/sdks/python/src/opik/runner/bridge_handlers/common.py +++ b/sdks/python/src/opik/runner/bridge_handlers/common.py @@ -5,7 +5,7 @@ import subprocess import threading from pathlib import Path -from typing import Optional, Set, Tuple +from typing import Set, Tuple from . import CommandError @@ -90,9 +90,39 @@ def resolve_text_file(path_str: str, repo_root: Path) -> Tuple[Path, str]: return path, raw -def git_ls_files(repo_root: Path) -> Optional[Set[str]]: - """Return all git-visible files (tracked + untracked non-ignored) as relative - paths. Returns None if git is unavailable or the directory isn't a repo.""" +def check_git_repo(repo_root: Path) -> None: + """Verify repo_root is inside a git repository. + + Raises CommandError with code 'not_a_git_repository' or 'git_not_available'. + """ + try: + result = subprocess.run( + ["git", "rev-parse", "--git-dir"], + cwd=str(repo_root), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=5.0, + ) + if result.returncode != 0: + raise CommandError( + "not_a_git_repository", + f"Operation requires a git repository, but {repo_root} is not one", + ) + except subprocess.TimeoutExpired: + raise CommandError("git_not_available", "git command timed out") + except FileNotFoundError: + raise CommandError( + "git_not_available", + "You must have git installed to use Opik Connect", + ) + + +def git_ls_files(repo_root: Path) -> Set[str]: + """Return all git-visible files (tracked + untracked non-ignored) as relative paths. + + Raises CommandError with code 'git_not_available' on timeout/missing git, + or 'search_failed' on unexpected git errors. + """ try: tracked = subprocess.run( ["git", "ls-files"], @@ -102,7 +132,10 @@ def git_ls_files(repo_root: Path) -> Optional[Set[str]]: timeout=10, ) if tracked.returncode != 0: - return None + raise CommandError( + "search_failed", + f"git ls-files failed: {tracked.stderr.strip()}", + ) untracked = subprocess.run( ["git", "ls-files", "--others", "--exclude-standard"], @@ -112,7 +145,7 @@ def git_ls_files(repo_root: Path) -> Optional[Set[str]]: timeout=10, ) - files = set() + files: Set[str] = set() for line in tracked.stdout.splitlines(): if line.strip(): files.add(line.strip()) @@ -120,8 +153,13 @@ def git_ls_files(repo_root: Path) -> Optional[Set[str]]: if line.strip(): files.add(line.strip()) return files - except (subprocess.TimeoutExpired, FileNotFoundError): - return None + except subprocess.TimeoutExpired: + raise CommandError("git_not_available", "git command timed out") + except FileNotFoundError: + raise CommandError( + "git_not_available", + "You must have git installed to use Opik Connect", + ) def backoff_wait( diff --git a/sdks/python/src/opik/runner/bridge_handlers/list_files.py b/sdks/python/src/opik/runner/bridge_handlers/list_files.py index bdb1fa6a554..55679986e36 100644 --- a/sdks/python/src/opik/runner/bridge_handlers/list_files.py +++ b/sdks/python/src/opik/runner/bridge_handlers/list_files.py @@ -1,8 +1,7 @@ """list_files bridge command handler.""" -import os from pathlib import Path, PurePosixPath -from typing import Any, Dict, List, Set, Tuple +from typing import Any, Dict, List from pydantic import BaseModel @@ -39,10 +38,8 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]: if not base.is_dir(): raise CommandError("file_not_found", f"Directory not found: {sub_path}") - walk_truncated = False + common.check_git_repo(self._repo_root) all_files = common.git_ls_files(self._repo_root) - if all_files is None: - all_files, walk_truncated = _walk_files(self._repo_root) try: base_rel = str(base.relative_to(self._repo_root)) @@ -62,7 +59,7 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]: matches: List[str] = [] total = len(filtered) byte_count = 0 - truncated = walk_truncated + truncated = False for rel in filtered: entry_bytes = len(rel.encode("utf-8")) + 1 @@ -88,36 +85,6 @@ def _matches_pattern(rel: str, pattern: str) -> bool: return False -_WALK_MAX_FILES = 10_000 - - -def _walk_files(repo_root: Path) -> Tuple[Set[str], bool]: - files: Set[str] = set() - for dirpath, dirnames, filenames in os.walk(repo_root): - dirnames[:] = [ - d - for d in dirnames - if not d.startswith(".") and d not in common.WALK_SKIP_DIRS - ] - for fname in filenames: - if fname.startswith("."): - continue - full = Path(dirpath) / fname - if full.is_symlink(): - try: - full.resolve().relative_to(repo_root.resolve()) - except ValueError: - continue - try: - rel = str(full.relative_to(repo_root)) - files.add(rel) - except ValueError: - continue - if len(files) >= _WALK_MAX_FILES: - return files, True - return files, False - - def _safe_mtime(path: Path) -> float: try: return path.stat().st_mtime diff --git a/sdks/python/src/opik/runner/bridge_handlers/search_files.py b/sdks/python/src/opik/runner/bridge_handlers/search_files.py index 3433aa7ecfc..30bf949a6a6 100644 --- a/sdks/python/src/opik/runner/bridge_handlers/search_files.py +++ b/sdks/python/src/opik/runner/bridge_handlers/search_files.py @@ -39,6 +39,8 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]: f"Pattern too long (max {_MAX_PATTERN_LENGTH} chars)", ) + common.check_git_repo(self._repo_root) + glob_filter = parsed.glob sub_path = parsed.path @@ -50,6 +52,7 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]: cmd = [ "git", "grep", + "--untracked", "-n", f"-C{_CONTEXT_LINES}", "--no-color", @@ -57,6 +60,8 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]: parsed.pattern, ] + # git grep doesn't support combining glob patterns and path arguments. + # If both are provided, glob takes precedence. if glob_filter: cmd.extend(["--", glob_filter]) elif sub_path: @@ -72,13 +77,12 @@ def execute(self, args: Dict[str, Any], timeout: float) -> Dict[str, Any]: ) except subprocess.TimeoutExpired: raise CommandError("timeout", "Search timed out") - except FileNotFoundError: - raise CommandError("internal", "git not available") if result.returncode not in (0, 1): stderr = result.stderr.strip() - if "invalid" in stderr.lower() or "error" in stderr.lower(): + if "invalid" in stderr.lower(): raise CommandError("match_not_found", f"Invalid pattern: {stderr}") + raise CommandError("search_failed", f"git grep failed: {stderr}") matches: List[Dict[str, Any]] = [] total_matches = 0 diff --git a/sdks/python/src/opik/runner/snapshot.py b/sdks/python/src/opik/runner/snapshot.py index f6f78b13c8e..dab17df37d8 100644 --- a/sdks/python/src/opik/runner/snapshot.py +++ b/sdks/python/src/opik/runner/snapshot.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Set -from .bridge_handlers import common +from .bridge_handlers import CommandError, common LOGGER = logging.getLogger(__name__) @@ -42,7 +42,11 @@ def build_checklist(repo_root: Path, command: Optional[List[str]]) -> Dict[str, Any]: - git_files = common.git_ls_files(repo_root) + try: + git_files: Optional[Set[str]] = common.git_ls_files(repo_root) + except CommandError: + # Not a git repo or git unavailable — fall back to os.walk + git_files = None file_tree = _build_file_tree(repo_root, git_files) matches = _find_instrumentation(repo_root, git_files) diff --git a/sdks/python/tests/unit/runner/bridge_handlers/test_list_files.py b/sdks/python/tests/unit/runner/bridge_handlers/test_list_files.py index 16c89cb7970..f09cd8d3b92 100644 --- a/sdks/python/tests/unit/runner/bridge_handlers/test_list_files.py +++ b/sdks/python/tests/unit/runner/bridge_handlers/test_list_files.py @@ -1,6 +1,7 @@ import subprocess import time from pathlib import Path +from unittest.mock import MagicMock, patch import pytest @@ -110,74 +111,22 @@ def test_list_files__no_matches__returns_empty(self, tmp_path: Path) -> None: assert result["total"] == 0 assert result["truncated"] is False - -class TestListFilesNonGit: - """Tests for ListFiles fallback when no git repo is present.""" - - def _handler(self, tmp_path: Path) -> ListFilesHandler: - return ListFilesHandler(tmp_path) - - def test_list_files__no_git__finds_files(self, tmp_path: Path) -> None: - (tmp_path / "a.py").write_text("x") - (tmp_path / "sub").mkdir() - (tmp_path / "sub" / "b.py").write_text("x") - handler = self._handler(tmp_path) - result = handler.execute({"pattern": "**/*.py"}, timeout=30.0) - assert "a.py" in result["files"] - assert any("b.py" in f for f in result["files"]) - - def test_list_files__no_git__skips_hidden_dirs(self, tmp_path: Path) -> None: - (tmp_path / ".hidden").mkdir() - (tmp_path / ".hidden" / "secret.py").write_text("x") - (tmp_path / "visible.py").write_text("x") - handler = self._handler(tmp_path) - result = handler.execute({"pattern": "**/*.py"}, timeout=30.0) - assert "visible.py" in result["files"] - assert not any("secret.py" in f for f in result["files"]) - - def test_list_files__no_git__skips_hidden_files(self, tmp_path: Path) -> None: - (tmp_path / ".env").write_text("SECRET=x") - (tmp_path / "app.py").write_text("x") - handler = self._handler(tmp_path) - result = handler.execute({}, timeout=30.0) - assert not any(".env" in f for f in result["files"]) - assert "app.py" in result["files"] - - def test_list_files__no_git__skips_junk_dirs(self, tmp_path: Path) -> None: - (tmp_path / "node_modules").mkdir() - (tmp_path / "node_modules" / "pkg.js").write_text("x") - (tmp_path / "__pycache__").mkdir() - (tmp_path / "__pycache__" / "mod.pyc").write_text("x") - (tmp_path / "app.py").write_text("x") + @patch("subprocess.run") + def test_list_files__not_a_git_repo__raises_error( + self, mock_run: MagicMock, tmp_path: Path + ) -> None: + mock_run.return_value = MagicMock(returncode=1) handler = self._handler(tmp_path) - result = handler.execute({}, timeout=30.0) - assert "app.py" in result["files"] - assert not any("node_modules" in f for f in result["files"]) - assert not any("__pycache__" in f for f in result["files"]) + with pytest.raises(CommandError) as exc_info: + handler.execute({"pattern": "*.py"}, timeout=30.0) + assert exc_info.value.code == "not_a_git_repository" - def test_list_files__no_git__symlink_outside_root_excluded( - self, tmp_path: Path + @patch("subprocess.run") + def test_list_files__git_not_available__raises_error( + self, mock_run: MagicMock, tmp_path: Path ) -> None: - outside = tmp_path / "outside" - outside.mkdir() - secret = outside / "secret.txt" - secret.write_text("sensitive") - - repo = tmp_path / "repo" - repo.mkdir() - (repo / "legit.py").write_text("x") - (repo / "link.txt").symlink_to(secret) - - handler = ListFilesHandler(repo) - result = handler.execute({}, timeout=30.0) - assert "legit.py" in result["files"] - assert not any("link.txt" in f for f in result["files"]) - - def test_list_files__no_git__caps_at_max_files(self, tmp_path: Path) -> None: - from opik.runner.bridge_handlers.list_files import _WALK_MAX_FILES - - for i in range(_WALK_MAX_FILES + 100): - (tmp_path / f"file_{i:06d}.txt").write_text("x") + mock_run.side_effect = FileNotFoundError("git not found") handler = self._handler(tmp_path) - result = handler.execute({}, timeout=30.0) - assert result["total"] <= _WALK_MAX_FILES + with pytest.raises(CommandError) as exc_info: + handler.execute({"pattern": "*.py"}, timeout=30.0) + assert exc_info.value.code == "git_not_available" diff --git a/sdks/python/tests/unit/runner/bridge_handlers/test_search_files.py b/sdks/python/tests/unit/runner/bridge_handlers/test_search_files.py index 9c5ca409251..0ac4ca3d8b3 100644 --- a/sdks/python/tests/unit/runner/bridge_handlers/test_search_files.py +++ b/sdks/python/tests/unit/runner/bridge_handlers/test_search_files.py @@ -1,11 +1,14 @@ import subprocess from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from opik.runner.bridge_handlers import CommandError from opik.runner.bridge_handlers.search_files import SearchFilesHandler +_TEST_TIMEOUT = 1.0 + def _git_init(tmp_path: Path) -> None: subprocess.run(["git", "init"], cwd=str(tmp_path), capture_output=True) @@ -47,7 +50,7 @@ def _setup_files(self, tmp_path: Path) -> None: def test_search_files__regex_pattern__finds_matches(self, tmp_path: Path) -> None: self._setup_files(tmp_path) handler = self._handler(tmp_path) - result = handler.execute({"pattern": r"def \w+"}, timeout=30.0) + result = handler.execute({"pattern": r"def \w+"}, timeout=_TEST_TIMEOUT) assert result["total_matches"] >= 2 assert any(m["file"] == "app.py" for m in result["matches"]) @@ -56,7 +59,7 @@ def test_search_files__match_found__includes_context_lines( ) -> None: self._setup_files(tmp_path) handler = self._handler(tmp_path) - result = handler.execute({"pattern": "return"}, timeout=30.0) + result = handler.execute({"pattern": "return"}, timeout=_TEST_TIMEOUT) match = result["matches"][0] assert "context_before" in match assert "context_after" in match @@ -69,7 +72,9 @@ def test_search_files__glob_filter__restricts_to_matching_files( (tmp_path / "b.txt").write_text("target\n") _git_add_commit(tmp_path) handler = self._handler(tmp_path) - result = handler.execute({"pattern": "target", "glob": "*.py"}, timeout=30.0) + result = handler.execute( + {"pattern": "target", "glob": "*.py"}, timeout=_TEST_TIMEOUT + ) files = [m["file"] for m in result["matches"]] assert "a.py" in files assert "b.txt" not in files @@ -79,7 +84,9 @@ def test_search_files__subdir_scope__searches_only_subdir( ) -> None: self._setup_files(tmp_path) handler = self._handler(tmp_path) - result = handler.execute({"pattern": "def", "path": "src"}, timeout=30.0) + result = handler.execute( + {"pattern": "def", "path": "src"}, timeout=_TEST_TIMEOUT + ) files = [m["file"] for m in result["matches"]] assert all("src/" in f for f in files) @@ -87,7 +94,7 @@ def test_search_files__path_traversal__raises_error(self, tmp_path: Path) -> Non _git_init(tmp_path) handler = self._handler(tmp_path) with pytest.raises(CommandError) as exc_info: - handler.execute({"pattern": "x", "path": "../../"}, timeout=30.0) + handler.execute({"pattern": "x", "path": "../../"}, timeout=_TEST_TIMEOUT) assert exc_info.value.code == "path_traversal" def test_search_files__no_matches__returns_empty(self, tmp_path: Path) -> None: @@ -95,12 +102,77 @@ def test_search_files__no_matches__returns_empty(self, tmp_path: Path) -> None: (tmp_path / "file.py").write_text("nothing here\n") _git_add_commit(tmp_path) handler = self._handler(tmp_path) - result = handler.execute({"pattern": "ZZZZZ"}, timeout=30.0) + result = handler.execute({"pattern": "ZZZZZ"}, timeout=_TEST_TIMEOUT) assert result["matches"] == [] assert result["total_matches"] == 0 def test_search_files__empty_pattern__raises_error(self, tmp_path: Path) -> None: handler = self._handler(tmp_path) with pytest.raises(CommandError) as exc_info: - handler.execute({"pattern": ""}, timeout=30.0) + handler.execute({"pattern": ""}, timeout=_TEST_TIMEOUT) assert exc_info.value.code == "match_not_found" + + def test_search_files__not_a_git_repo__raises_error(self, tmp_path: Path) -> None: + (tmp_path / "file.py").write_text("hello\n") + handler = self._handler(tmp_path) + with pytest.raises(CommandError) as exc_info: + handler.execute({"pattern": "hello"}, timeout=_TEST_TIMEOUT) + assert exc_info.value.code == "not_a_git_repository" + + def test_search_files__untracked_file__is_searched(self, tmp_path: Path) -> None: + _git_init(tmp_path) + _git_add_commit(tmp_path) + # Write after commit so the file is untracked + (tmp_path / "untracked.py").write_text("def untracked_func():\n pass\n") + handler = self._handler(tmp_path) + result = handler.execute({"pattern": r"def \w+"}, timeout=_TEST_TIMEOUT) + files = [m["file"] for m in result["matches"]] + assert "untracked.py" in files + + @patch("subprocess.run") + def test_search_files__uses_untracked_flag( + self, mock_run: MagicMock, tmp_path: Path + ) -> None: + def side_effect(*args, **kwargs): + result = MagicMock() + result.returncode = 0 + result.stdout = "" + result.stderr = "" + return result + + mock_run.side_effect = side_effect + + handler = self._handler(tmp_path) + handler.execute({"pattern": "target"}, timeout=_TEST_TIMEOUT) + + grep_call = [call for call in mock_run.call_args_list if "grep" in call[0][0]][ + 0 + ] + assert "--untracked" in grep_call[0][0] + + @patch("subprocess.run") + def test_search_files__glob_takes_precedence_over_path( + self, mock_run: MagicMock, tmp_path: Path + ) -> None: + (tmp_path / "src").mkdir() + + def side_effect(*args, **kwargs): + result = MagicMock() + result.returncode = 0 + result.stdout = "" + result.stderr = "" + return result + + mock_run.side_effect = side_effect + + handler = self._handler(tmp_path) + handler.execute( + {"pattern": "target", "glob": "*.txt", "path": "src"}, timeout=_TEST_TIMEOUT + ) + + grep_call = [call for call in mock_run.call_args_list if "grep" in call[0][0]][ + 0 + ] + cmd = grep_call[0][0] + assert "*.txt" in cmd + assert "src" not in cmd[cmd.index("--") + 1 :]