diff --git a/pyproject.toml b/pyproject.toml index 1094621..e8d7b2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "gpt_copy" -version = "2.3.0" +version = "2.4.0" description = "A script to concatenate files into a single structured stream." readme = "readme.md" requires-python = ">=3.10" diff --git a/readme.md b/readme.md index 95875a0..d818192 100644 --- a/readme.md +++ b/readme.md @@ -10,6 +10,7 @@ GPT Copy is a command-line tool that recursively scans a directory, collects rea - **Force Mode:** The `-f/--force` option bypasses ignore rules and Git-tracked file restrictions. - **Line Numbering:** Zero-padded line numbers are added to each file's content by default (similar to `cat -n`). Use `--no-number` to disable. - **Token Counting:** Includes a separate `tokens` CLI command to count the number of tokens in text using OpenAI’s `tiktoken` library with GPT-4o model encoding. +- **Integrated Token Analysis:** Use `--tokens` to display token counts for each file in the tree structure, with `--top-n` to filter and show only the files with the most tokens. ## Installation @@ -114,6 +115,27 @@ Count the number of tokens in a given text using GPT-4o encoding. The command re gpt-copy /path/to/directory | tokens ``` +### Display Token Counts in Tree Structure +Display token counts for each file in the directory tree using the `--tokens` option: + +```sh +gpt-copy /path/to/directory --tokens +``` + +**Filter by Top N Files by Token Count:** +Show only the files with the highest token counts: + +```sh +gpt-copy /path/to/directory --tokens --top-n 5 +``` + +**Combine with File Filtering:** +Use with include/exclude patterns to count tokens only for specific file types: + +```sh +gpt-copy /path/to/directory --tokens --include "*.py" --top-n 3 +``` + ## How It Works 1. **Collects `.gitignore` Rules:** Scans the directory for `.gitignore` files and applies the rules to skip ignored files unless the force mode is enabled. 
from dataclasses import dataclass
from typing import List


@dataclass
class FileInfo:
    """Path and token count of one entry shown in the token tree."""

    # Location of the file on disk (root path joined with the relative path).
    path: Path
    # POSIX-style path relative to the scanned root; decides the tree position.
    relative_path: str
    # Tokens attributed to the file; the collector stores 0 for binary or
    # unreadable files.
    token_count: int
    # Marks directory entries; the collector below only ever emits files.
    is_directory: bool = False


# Single-slot cache for the tiktoken encoder. Empty until the first lookup,
# afterwards it holds either the encoder or None (encoder unavailable), so the
# lookup is attempted once per process instead of once per file.
_TOKEN_ENCODER_CACHE: list = []


def _get_token_encoder():
    """Return the cached GPT-4o tiktoken encoder, or None if it cannot be loaded."""
    if not _TOKEN_ENCODER_CACHE:
        try:
            import tiktoken

            encoder = tiktoken.encoding_for_model("gpt-4o")
        except Exception:
            # Deliberate best effort: whatever goes wrong (e.g. the package is
            # missing or the encoding cannot be loaded), callers fall back to
            # the character-based estimate.
            encoder = None
        _TOKEN_ENCODER_CACHE.append(encoder)
    return _TOKEN_ENCODER_CACHE[0]


def count_tokens_safe(text: str) -> int:
    """
    Count tokens using tiktoken if available, otherwise use a simple estimation.

    Both code paths report at least 1 token, so the result for a given text
    (including the empty string) does not flip between 0 and 1 depending on
    whether tiktoken could be used.

    Args:
        text (str): The text to count tokens for.

    Returns:
        int: The (possibly estimated) number of tokens, never less than 1.
    """
    token_total = None
    encoder = _get_token_encoder()
    if encoder is not None:
        try:
            # disallowed_special=() makes literal special-token text such as
            # "<|endoftext|>" encode as ordinary text instead of raising.
            token_total = len(encoder.encode(text, disallowed_special=()))
        except Exception:
            # Keep the function "safe": any encoder failure uses the estimate.
            token_total = None

    if token_total is None:
        # Rough approximation: ~4 characters per token for English text.
        token_total = len(text) // 4

    return max(1, token_total)


def _count_file_tokens(file_path: Path, rel_path: str) -> int:
    """Return the token count of one file; 0 if it is binary or unreadable."""
    if is_binary_file(file_path):
        return 0

    try:
        # errors="replace" keeps undecodable bytes from raising, so only
        # OS-level failures (permissions, vanished file, ...) are expected here.
        with file_path.open("r", encoding="utf-8", errors="replace") as handle:
            content = handle.read()
    except OSError as exc:
        print(
            f"Warning: Could not read {rel_path} for token counting: {exc}",
            file=sys.stderr,
        )
        return 0

    return count_tokens_safe(content)


def collect_file_info_with_tokens(
    root_path: Path,
    gitignore_specs: dict[str, PathSpec],
    tracked_files: set[str] | None,
    include_patterns: list[str] | None = None,
    exclude_patterns: list[str] | None = None,
) -> List[FileInfo]:
    """
    Collect file information including token counts for all files.

    Files rejected by ``is_ignored`` or by the include/exclude patterns are
    skipped. Binary files and files that cannot be read are still listed, with
    a token count of 0.

    Args:
        root_path (Path): The root path to start collecting files.
        gitignore_specs (Dict[str, PathSpec]): The gitignore specifications.
        tracked_files (Optional[Set[str]]): The set of tracked files.
        include_patterns (Optional[List[str]]): The list of include glob patterns.
        exclude_patterns (Optional[List[str]]): The list of exclude glob patterns.

    Returns:
        List[FileInfo]: List of FileInfo objects with token counts, in a
        deterministic (name-sorted) traversal order.
    """
    print("Collecting file information with token counts...", file=sys.stderr)

    include_patterns = include_patterns or []
    exclude_patterns = exclude_patterns or []
    file_infos: List[FileInfo] = []

    for dirpath, dirnames, filenames in os.walk(root_path):
        # Sorting in place fixes the traversal order, which in turn makes
        # tie-breaking between equally sized files reproducible.
        dirnames.sort()
        current_dir = Path(dirpath)

        for filename in sorted(filenames):
            full_file_path = current_dir / filename

            if is_ignored(full_file_path, gitignore_specs, root_path, tracked_files):
                continue

            rel_path = full_file_path.relative_to(root_path).as_posix()
            if not should_include_file(rel_path, include_patterns, exclude_patterns):
                continue

            file_infos.append(
                FileInfo(
                    path=full_file_path,
                    relative_path=rel_path,
                    token_count=_count_file_tokens(full_file_path, rel_path),
                )
            )

    return file_infos


def calculate_directory_tokens(dir_structure, path_parts=()):
    """
    Calculate total tokens for directories recursively.

    Args:
        dir_structure: Nested dictionary; a dict value is a sub-directory, any
            other value is an object exposing ``token_count``.
        path_parts: Tuple of path parts for the current directory. It is only
            extended while descending and does not influence the result.

    Returns:
        int: Total token count for the directory (0 for an empty structure).
    """
    return sum(
        calculate_directory_tokens(item, path_parts + (name,))
        if isinstance(item, dict)
        else item.token_count
        for name, item in dir_structure.items()
    )


def generate_tree_with_tokens(
    root_path: Path,
    file_infos: List[FileInfo],
    gitignore_specs: dict[str, PathSpec],
    tracked_files: set[str] | None = None,
    exclude_patterns: list[str] | None = None,
    top_n: int | None = None,
) -> str:
    """
    Generate a folder structure tree with token counts.

    Directories are listed before files at every level, and both groups are
    ordered by descending token count. Directory totals only include the files
    that are actually shown.

    Args:
        root_path (Path): The root path; only used for the first line's label.
        file_infos (List[FileInfo]): List of file information with token counts.
            Callers are expected to pass an already-filtered list.
        gitignore_specs (Dict[str, PathSpec]): Accepted but not consulted here.
        tracked_files (Optional[Set[str]]): Accepted but not consulted here.
        exclude_patterns (Optional[List[str]]): Accepted but not consulted here.
        top_n (Optional[int]): Show only top N files by token count. Negative
            values are treated as 0.

    Returns:
        str: The generated folder structure tree with token counts.
    """
    print("Generating folder structure tree with token counts...", file=sys.stderr)

    # With top_n set, keep only the N largest files but still show their place
    # in the directory hierarchy.
    top_paths = None
    if top_n is not None:
        # A negative slice bound would silently drop the *smallest* files
        # instead of limiting the output, so clamp it.
        top_n = max(top_n, 0)
        ranked = sorted(file_infos, key=lambda info: info.token_count, reverse=True)
        top_paths = {info.relative_path for info in ranked[:top_n]}

    # Nested dict mirroring the directory layout; leaves are FileInfo objects.
    dir_structure: dict = {}
    for info in file_infos:
        if top_paths is not None and info.relative_path not in top_paths:
            continue

        parts = Path(info.relative_path).parts
        if not parts:
            continue

        node = dir_structure
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = info

    root_label = root_path.name or str(root_path)
    root_tokens = calculate_directory_tokens(dir_structure)
    if root_tokens > 0:
        tree_lines = [f"{root_label} ({root_tokens} tokens)"]
    else:
        tree_lines = [f"{root_label} (directory)"]

    def _add_tree_items(items, prefix=""):
        """Append one tree level (and, recursively, its children) to tree_lines."""
        directories = []
        files = []
        for name, item in items.items():
            if isinstance(item, dict):
                directories.append((name, item, calculate_directory_tokens(item), True))
            else:
                files.append((name, item, item.token_count, False))

        # Stable sorts: entries with equal counts keep their insertion order.
        directories.sort(key=lambda entry: entry[2], reverse=True)
        files.sort(key=lambda entry: entry[2], reverse=True)

        ordered = directories + files
        last_index = len(ordered) - 1
        for idx, (name, item, tokens, is_dir) in enumerate(ordered):
            is_last = idx == last_index
            connector = "└── " if is_last else "├── "

            if is_dir:
                tree_lines.append(f"{prefix}{connector}{name}/ ({tokens} tokens)")
                # Four columns wide so children line up under the
                # four-character connectors.
                _add_tree_items(item, prefix + ("    " if is_last else "│   "))
            else:
                tree_lines.append(f"{prefix}{connector}{name} ({tokens} tokens)")

    _add_tree_items(dir_structure)

    if top_n is not None:
        total_files = sum(1 for info in file_infos if not info.is_directory)
        tree_lines.append(
            f"\nShowing top {min(top_n, total_files)} files by token count"
        )

    return "\n".join(tree_lines)
#!/usr/bin/env python3
"""
Integration tests for gpt-copy tokens functionality.
Tests the CLI interface and integration with existing functionality.
"""
import tempfile
import subprocess
import sys
from pathlib import Path
import os


def run_gpt_copy_command(args, cwd=None):
    """
    Run the gpt-copy CLI in a subprocess and return its result.

    A throw-away wrapper script puts the repository's ``src`` directory on
    ``sys.path`` and invokes ``gpt_copy.gpt_copy.main``; the script is deleted
    again once the subprocess has finished.

    Args:
        args (list[str]): Command-line arguments passed to gpt-copy.
        cwd: Working directory for the subprocess (None keeps the current one).

    Returns:
        tuple: ``(returncode, stdout, stderr)`` of the subprocess.
    """
    src_dir = Path(__file__).parent.parent / "src"
    # repr() produces a valid Python string literal even when the path contains
    # backslashes (Windows) or quotes; plain interpolation would not.
    wrapper_script = f"""
import sys
sys.path.insert(0, {str(src_dir)!r})
from gpt_copy.gpt_copy import main

if __name__ == "__main__":
    main()
"""

    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(wrapper_script)
        wrapper_path = f.name

    try:
        cmd = [sys.executable, wrapper_path] + args
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=cwd,
        )
        return result.returncode, result.stdout, result.stderr
    finally:
        os.unlink(wrapper_path)


def test_tokens_option_basic():
    """Test basic --tokens functionality."""
    print("Testing basic --tokens functionality...")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create test files
        (temp_path / "file1.py").write_text("print('hello world')")
        (temp_path / "file2.js").write_text("console.log('test');")

        # Run gpt-copy with --tokens
        returncode, stdout, stderr = run_gpt_copy_command([str(temp_path), "--tokens"])

        assert returncode == 0, f"Command failed with stderr: {stderr}"
        assert "tokens)" in stdout, "Output should contain token counts"
        assert "file1.py" in stdout, "Output should contain file1.py"
        assert "file2.js" in stdout, "Output should contain file2.js"

    print("✓ Basic --tokens functionality works")


def test_tokens_with_top_n():
    """Test --tokens with --top-n functionality."""
    print("Testing --tokens with --top-n...")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create test files with different lengths
        (temp_path / "small.py").write_text("x=1")
        (temp_path / "medium.py").write_text("print('hello world')")
        (temp_path / "large.py").write_text(
            "# This is a long comment\nprint('hello world')\n# Another comment"
        )

        # Run gpt-copy with --tokens --top-n 2
        returncode, stdout, stderr = run_gpt_copy_command(
            [str(temp_path), "--tokens", "--top-n", "2"]
        )

        assert returncode == 0, f"Command failed with stderr: {stderr}"
        assert "Showing top 2 files" in stdout, "Should show top-n message"

        # The largest file should be included
        assert "large.py" in stdout, "Largest file should be included"
        # ...and with three files and N=2 the smallest one must be filtered out
        assert "small.py" not in stdout, "Smallest file should be excluded"

    print("✓ --tokens with --top-n works")


def test_tokens_with_include_filter():
    """Test --tokens with file filtering."""
    print("Testing --tokens with include filters...")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create test files
        (temp_path / "script.py").write_text("print('python script')")
        (temp_path / "app.js").write_text("console.log('javascript');")
        (temp_path / "readme.txt").write_text("This is documentation")

        # Run gpt-copy with --tokens and Python filter
        returncode, stdout, stderr = run_gpt_copy_command([
            str(temp_path), "--tokens", "--include", "*.py"
        ])

        assert returncode == 0, f"Command failed with stderr: {stderr}"
        assert "script.py" in stdout, "Python file should be included"
        assert "app.js" not in stdout, "JavaScript file should be excluded"
        assert "readme.txt" not in stdout, "Text file should be excluded"

    print("✓ --tokens with include filters works")


def test_tokens_help_option():
    """Test that help shows the new options."""
    print("Testing help output includes new options...")

    returncode, stdout, stderr = run_gpt_copy_command(["--help"])

    # Help should work (even without a directory argument)
    assert returncode == 0, f"Command failed with stderr: {stderr}"
    assert "--tokens" in stdout, "Help should show --tokens option"
    assert "--top-n" in stdout, "Help should show --top-n option"
    assert "Display token counts" in stdout, "Help should describe --tokens"
    assert "top N files by token count" in stdout, "Help should describe --top-n"

    print("✓ Help output includes new options")


def test_regular_functionality_still_works():
    """Test that existing functionality is not broken."""
    print("Testing that regular functionality still works...")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create test file
        (temp_path / "test.py").write_text("print('test')")

        # Test regular tree-only functionality
        returncode, stdout, stderr = run_gpt_copy_command([str(temp_path), "--tree-only"])

        assert returncode == 0, f"Command failed with stderr: {stderr}"
        assert "test.py" in stdout, "File should appear in tree"
        assert "tokens)" not in stdout, "Should not show token counts in regular mode"

    print("✓ Regular functionality still works")


def test_tokens_without_top_n():
    """Test that --top-n without --tokens is ignored."""
    print("Testing --top-n without --tokens is ignored...")

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Create test file
        (temp_path / "test.py").write_text("print('test')")

        # Use --top-n without --tokens
        returncode, stdout, stderr = run_gpt_copy_command(
            [str(temp_path), "--top-n", "1", "--tree-only"]
        )

        assert returncode == 0, f"Command failed with stderr: {stderr}"
        assert "test.py" in stdout, "File should appear normally"
        assert "tokens)" not in stdout, "Should not show token counts"
        assert "Showing top" not in stdout, "Should not show top-n message"

    print("✓ --top-n without --tokens is ignored correctly")


def run_all_cli_tests():
    """
    Run all CLI tests in sequence, stopping at the first failure.

    Returns:
        bool: True if every test passed, False otherwise.
    """
    print("Running CLI integration tests for token functionality...\n")

    try:
        test_tokens_option_basic()
        test_tokens_with_top_n()
        test_tokens_with_include_filter()
        test_tokens_help_option()
        test_regular_functionality_still_works()
        test_tokens_without_top_n()

        print("\n" + "="*60)
        print("✅ All CLI tests passed successfully!")
        print("Token functionality is properly integrated with CLI.")

    except Exception as e:
        print(f"\n❌ CLI test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False

    return True


if __name__ == "__main__":
    # Propagate the outcome so that running this file as a script fails loudly.
    sys.exit(0 if run_all_cli_tests() else 1)
#!/usr/bin/env python3
"""Tests for token counting functionality in gpt-copy."""

import tempfile
import pytest
from pathlib import Path
import sys
import os

# Add src to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))

from gpt_copy.gpt_copy import (
    count_tokens_safe,
    collect_file_info_with_tokens,
    generate_tree_with_tokens,
    get_ignore_settings,
    FileInfo
)


class TestTokensCounting:
    """Test token counting functionality."""

    def test_count_tokens_safe_simple(self):
        """Test basic token counting."""
        # Test simple cases
        assert count_tokens_safe("Hello world") > 0
        assert count_tokens_safe("") == 1  # Minimum of 1 token

        # Test that more text gives more tokens
        short_text = "Hello"
        long_text = "Hello world this is a much longer piece of text that should have more tokens"
        assert count_tokens_safe(long_text) > count_tokens_safe(short_text)

    def test_collect_file_info_with_tokens(self):
        """Test collecting file information with token counts."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create test files
            (temp_path / "short.py").write_text("print('hi')")
            (temp_path / "long.py").write_text(
                "# This is a much longer file with more content\nprint('hello world')\n# More comments"
            )
            (temp_path / "empty.txt").write_text("")

            # Get ignore settings
            gitignore_specs, tracked_files = get_ignore_settings(temp_path, force=True)

            # Collect file info
            file_infos = collect_file_info_with_tokens(
                temp_path,
                gitignore_specs,
                tracked_files,
                include_patterns=None,
                exclude_patterns=None,
            )

            # Check results
            assert len(file_infos) == 3

            # Find specific files
            short_file = next(f for f in file_infos if f.relative_path == "short.py")
            long_file = next(f for f in file_infos if f.relative_path == "long.py")
            empty_file = next(f for f in file_infos if f.relative_path == "empty.txt")

            # Verify token counts make sense
            assert short_file.token_count > 0
            assert long_file.token_count > short_file.token_count
            assert empty_file.token_count == 1  # Minimum 1 token

    def test_generate_tree_with_tokens(self):
        """Test generating tree structure with token counts."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create test files
            (temp_path / "file1.py").write_text("print('test1')")
            (temp_path / "file2.py").write_text("print('test2 with more content')")

            # Create file infos
            file_infos = [
                FileInfo(temp_path / "file1.py", "file1.py", 3, False),
                FileInfo(temp_path / "file2.py", "file2.py", 7, False),
            ]

            # Generate tree
            tree_output = generate_tree_with_tokens(
                temp_path,
                file_infos,
                {},  # gitignore_specs
                None,  # tracked_files
                None,  # exclude_patterns
                None,  # top_n
            )

            # Check output contains token counts
            assert "3 tokens" in tree_output
            assert "7 tokens" in tree_output
            assert "file1.py" in tree_output
            assert "file2.py" in tree_output

    def test_generate_tree_with_tokens_top_n(self):
        """Test generating tree with top-N filtering and correct ordering."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create file infos with different token counts (in random order)
            file_infos = [
                FileInfo(temp_path / "small.py", "small.py", 2, False),
                FileInfo(temp_path / "medium.py", "medium.py", 5, False),
                FileInfo(temp_path / "large.py", "large.py", 10, False),
                FileInfo(temp_path / "subdir/huge.py", "subdir/huge.py", 15, False),
            ]

            # Generate tree with top-3
            tree_output = generate_tree_with_tokens(
                temp_path,
                file_infos,
                {},  # gitignore_specs
                None,  # tracked_files
                None,  # exclude_patterns
                3,  # top_n
            )

            # Check only top 3 files are included
            assert "huge.py" in tree_output  # 15 tokens - should be included (in subdir)
            assert "large.py" in tree_output  # 10 tokens - should be included
            assert "medium.py" in tree_output  # 5 tokens - should be included
            assert "small.py" not in tree_output  # 2 tokens - should be excluded
            assert "Showing top 3 files" in tree_output

            # Each expected entry is rendered with its token count
            assert "subdir/ (15 tokens)" in tree_output
            assert "huge.py (15 tokens)" in tree_output
            assert "large.py (10 tokens)" in tree_output
            assert "medium.py (5 tokens)" in tree_output

            # Check that entries really appear in descending token order:
            # - subdir/ directory listed first (highest tokens: 15)
            # - huge.py (15 tokens) directly below it, inside subdir
            # - large.py (10 tokens) at root level
            # - medium.py (5 tokens) at root level
            lines = tree_output.split('\n')

            def line_index(fragment):
                return next(i for i, line in enumerate(lines) if fragment in line)

            subdir_idx = line_index("subdir/ (15 tokens)")
            large_idx = line_index("large.py (10 tokens)")
            medium_idx = line_index("medium.py (5 tokens)")
            assert line_index("huge.py (15 tokens)") == subdir_idx + 1
            assert subdir_idx < large_idx < medium_idx

    def test_file_filtering_with_tokens(self):
        """Test that file filtering works with token counting."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create test files
            (temp_path / "test.py").write_text("print('python')")
            (temp_path / "test.js").write_text("console.log('javascript')")
            (temp_path / "readme.txt").write_text("This is a readme file")

            # Get ignore settings
            gitignore_specs, tracked_files = get_ignore_settings(temp_path, force=True)

            # Collect only Python files
            file_infos = collect_file_info_with_tokens(
                temp_path,
                gitignore_specs,
                tracked_files,
                include_patterns=["*.py"],
                exclude_patterns=None,
            )

            # Should only have the Python file
            assert len(file_infos) == 1
            assert file_infos[0].relative_path == "test.py"
            assert file_infos[0].token_count > 0


if __name__ == "__main__":
    # Hand pytest's exit status back to the shell instead of discarding it.
    sys.exit(pytest.main([__file__]))
a/uv.lock b/uv.lock index a5cd550..e7cc0bf 100644 --- a/uv.lock +++ b/uv.lock @@ -191,7 +191,7 @@ wheels = [ [[package]] name = "gpt-copy" -version = "2.3.0" +version = "2.4.0" source = { editable = "." } dependencies = [ { name = "click" },