Skip to content

Commit f0d5daf

Browse files
authored
feat(spans): Distribute span payload keys across Redis cluster (#110593)
Spread span payload sets across Redis cluster nodes to avoid concentrated large traces on a single node. Instead of merging all payloads under {project_id:trace_id}, write them to {project_id:trace_id:span_id} so they shard across nodes. A member-keys tracking set (span-buf:mk) indexes which distributed keys belong to each segment. Three-phase rollout (similar to the ZSET to SET change): - Phase 1 (`write-distributed-payloads` set to True): Dual-write to both key formats, read from merged set keys. - Phase 2 (`read-distributed-payloads` set to True): Dual-write continues, flusher reads from distributed keys. - Phase 3 (`write-merged-payloads` set to False): Stop writing merged payloads.
1 parent c76c8e6 commit f0d5daf

File tree

5 files changed

+382
-15
lines changed

5 files changed

+382
-15
lines changed

src/sentry/options/defaults.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3299,6 +3299,27 @@
32993299
flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
33003300
)
33013301

3302+
# Write payload sets to per-span distributed keys AND merged keys.
3303+
# Flusher reads merged keys as before.
3304+
register(
3305+
"spans.buffer.write-distributed-payloads",
3306+
default=False,
3307+
flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
3308+
)
3309+
# Switch flusher to read from distributed keys instead of merged.
3310+
register(
3311+
"spans.buffer.read-distributed-payloads",
3312+
default=False,
3313+
flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
3314+
)
3315+
# Set to False to stop writing merged keys and skip set merges.
3316+
# Disable after read-distributed-payloads is stable. Rollback: re-enable
3317+
# this flag to resume merged writes before reverting read-distributed-payloads.
3318+
register(
3319+
"spans.buffer.write-merged-payloads",
3320+
default=True,
3321+
flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
3322+
)
33023323
# List of trace_ids to enable debug logging for. Empty = debug off.
33033324
# When set, logs detailed metrics about zunionstore set sizes, key existence, and trace structure.
33043325
register(

src/sentry/scripts/spans/add-buffer.lua

Lines changed: 87 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ ARGS:
2929
- max_segment_bytes -- int -- The maximum number of bytes the segment can contain.
3030
- byte_count -- int -- The total number of bytes in the subsegment.
3131
- zero_copy_dest_threshold -- int -- When > 0, use SMEMBERS+SADD instead of SUNIONSTORE when the destination set exceeds this many bytes.
32+
- write_distributed_payloads -- "true" or "false" -- When true, maintain member-keys tracking sets for distributed payload keys.
33+
- write_merged_payloads -- "true" or "false" -- When false, skip set merges and set-key expire commands.
3234
- *span_id -- str[] -- The span ids in the subsegment.
3335
3436
RETURNS:
@@ -49,7 +51,9 @@ local set_timeout = tonumber(ARGV[4])
4951
local max_segment_bytes = tonumber(ARGV[5])
5052
local byte_count = tonumber(ARGV[6])
5153
local zero_copy_dest_threshold = tonumber(ARGV[7])
52-
local NUM_ARGS = 7
54+
local write_distributed_payloads = ARGV[8] == "true"
55+
local write_merged_payloads = ARGV[9] == "true"
56+
local NUM_ARGS = 9
5357

5458
local function get_time_ms()
5559
local time = redis.call("TIME")
@@ -102,7 +106,7 @@ local sunionstore_args = {}
102106
-- Updating the redirect set instead is needed when we receive higher level spans
103107
-- for a tree we are assembling as the segment root each span points at in the
104108
-- redirect set changes when a new root is found.
105-
if set_span_id ~= parent_span_id and redis.call("scard", parent_key) > 0 then
109+
if write_merged_payloads and set_span_id ~= parent_span_id and redis.call("scard", parent_key) > 0 then
106110
table.insert(sunionstore_args, parent_key)
107111
end
108112

@@ -113,7 +117,7 @@ for i = NUM_ARGS + 1, NUM_ARGS + num_spans do
113117
table.insert(hset_args, span_id)
114118
table.insert(hset_args, set_span_id)
115119

116-
if not is_root_span then
120+
if not is_root_span and write_merged_payloads then
117121
local span_key = string.format("span-buf:s:{%s}:%s", project_and_trace, span_id)
118122
table.insert(sunionstore_args, span_key)
119123
end
@@ -128,7 +132,83 @@ table.insert(latency_table, {"sunionstore_args_step_latency_ms", sunionstore_arg
128132
-- Merge spans into the parent span set.
129133
-- Used outside the if statement
130134
local arg_cleanup_end_time_ms = sunionstore_args_end_time_ms
131-
if #sunionstore_args > 0 then
135+
-- Maintain member-keys (span-buf:mk) tracking sets so the flusher
136+
-- knows which distributed keys to fetch. This runs in both write-only and
137+
-- full distributed mode.
138+
if write_distributed_payloads then
139+
local member_keys_key = string.format("span-buf:mk:{%s}:%s", project_and_trace, set_span_id)
140+
redis.call("sadd", member_keys_key, parent_span_id)
141+
142+
-- Merge child tracking sets from span_ids that were previously segment roots.
143+
for i = NUM_ARGS + 1, NUM_ARGS + num_spans do
144+
local span_id = ARGV[i]
145+
if span_id ~= parent_span_id then
146+
local child_mk_key = string.format("span-buf:mk:{%s}:%s", project_and_trace, span_id)
147+
local child_members = redis.call("smembers", child_mk_key)
148+
if #child_members > 0 then
149+
redis.call("sadd", member_keys_key, unpack(child_members))
150+
redis.call("del", child_mk_key)
151+
end
152+
end
153+
end
154+
155+
-- Merge parent's tracking set if parent_span_id redirected to a different root.
156+
if set_span_id ~= parent_span_id then
157+
local parent_mk_key = string.format("span-buf:mk:{%s}:%s", project_and_trace, parent_span_id)
158+
local parent_members = redis.call("smembers", parent_mk_key)
159+
if #parent_members > 0 then
160+
redis.call("sadd", member_keys_key, unpack(parent_members))
161+
redis.call("del", parent_mk_key)
162+
end
163+
end
164+
165+
redis.call("expire", member_keys_key, set_timeout)
166+
arg_cleanup_end_time_ms = get_time_ms()
167+
table.insert(latency_table, {"distributed_tracking_step_latency_ms", arg_cleanup_end_time_ms - sunionstore_args_end_time_ms})
168+
end
169+
170+
-- When write_merged_payloads is false, merged set merges are skipped but we
171+
-- still need to merge ic/ibc counters from child keys into the segment root.
172+
if not write_merged_payloads then
173+
local ingested_count_key = string.format("span-buf:ic:%s", set_key)
174+
local ingested_byte_count_key = string.format("span-buf:ibc:%s", set_key)
175+
for i = NUM_ARGS + 1, NUM_ARGS + num_spans do
176+
local span_id = ARGV[i]
177+
if span_id ~= parent_span_id then
178+
local child_merged = string.format("span-buf:s:{%s}:%s", project_and_trace, span_id)
179+
local child_ic_key = string.format("span-buf:ic:%s", child_merged)
180+
local child_ibc_key = string.format("span-buf:ibc:%s", child_merged)
181+
local child_count = redis.call("get", child_ic_key)
182+
local child_byte_count = redis.call("get", child_ibc_key)
183+
if child_count then
184+
redis.call("incrby", ingested_count_key, child_count)
185+
redis.call("del", child_ic_key)
186+
end
187+
if child_byte_count then
188+
redis.call("incrby", ingested_byte_count_key, child_byte_count)
189+
redis.call("del", child_ibc_key)
190+
end
191+
end
192+
end
193+
if set_span_id ~= parent_span_id then
194+
local parent_merged = string.format("span-buf:s:{%s}:%s", project_and_trace, parent_span_id)
195+
local parent_ic_key = string.format("span-buf:ic:%s", parent_merged)
196+
local parent_ibc_key = string.format("span-buf:ibc:%s", parent_merged)
197+
local parent_count = redis.call("get", parent_ic_key)
198+
local parent_byte_count = redis.call("get", parent_ibc_key)
199+
if parent_count then
200+
redis.call("incrby", ingested_count_key, parent_count)
201+
redis.call("del", parent_ic_key)
202+
end
203+
if parent_byte_count then
204+
redis.call("incrby", ingested_byte_count_key, parent_byte_count)
205+
redis.call("del", parent_ibc_key)
206+
end
207+
end
208+
arg_cleanup_end_time_ms = get_time_ms()
209+
table.insert(latency_table, {"distributed_ibc_merge_step_latency_ms", arg_cleanup_end_time_ms - sunionstore_args_end_time_ms})
210+
211+
elseif #sunionstore_args > 0 then
132212
local dest_memory = redis.call("memory", "usage", set_key) or 0
133213
local ingested_byte_count_key = string.format("span-buf:ibc:%s", set_key)
134214
local dest_bytes = tonumber(redis.call("get", ingested_byte_count_key) or 0)
@@ -210,7 +290,9 @@ redis.call("incrby", ingested_byte_count_key, byte_count)
210290
redis.call("expire", ingested_count_key, set_timeout)
211291
redis.call("expire", ingested_byte_count_key, set_timeout)
212292

213-
redis.call("expire", set_key, set_timeout)
293+
if write_merged_payloads then
294+
redis.call("expire", set_key, set_timeout)
295+
end
214296

215297
local ingested_count_end_time_ms = get_time_ms()
216298
local ingested_count_step_latency_ms = ingested_count_end_time_ms - arg_cleanup_end_time_ms

src/sentry/spans/buffer.py

Lines changed: 109 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,12 @@
9191
)
9292
from sentry.spans.consumers.process_segments.types import attribute_value
9393
from sentry.spans.debug_trace_logger import DebugTraceLogger
94-
from sentry.spans.segment_key import SegmentKey, parse_segment_key, segment_key_to_span_id
94+
from sentry.spans.segment_key import (
95+
DistributedPayloadKey,
96+
SegmentKey,
97+
parse_segment_key,
98+
segment_key_to_span_id,
99+
)
95100
from sentry.utils import metrics, redis
96101
from sentry.utils.outcomes import Outcome, track_outcome
97102

@@ -137,6 +142,7 @@ class FlushedSegment(NamedTuple):
137142
queue_key: QueueKey
138143
spans: list[OutputSpan]
139144
project_id: int # Used to track outcomes
145+
distributed_payload_keys: list[DistributedPayloadKey] = [] # For cleanup
140146

141147

142148
class SpansBuffer:
@@ -152,6 +158,7 @@ def __init__(self, assigned_shards: list[int], slice_id: int | None = None):
152158
self._buffer_logger = BufferLogger()
153159
self._flusher_logger = FlusherLogger()
154160
self._debug_trace_logger: DebugTraceLogger | None = None
161+
self._distributed_payload_keys_map: dict[SegmentKey, list[bytes]] = {}
155162

156163
@cached_property
157164
def client(self) -> RedisCluster[bytes] | StrictRedis[bytes]:
@@ -164,6 +171,30 @@ def __reduce__(self):
164171
def _get_span_key(self, project_and_trace: str, span_id: str) -> bytes:
165172
return f"span-buf:s:{{{project_and_trace}}}:{span_id}".encode("ascii")
166173

174+
def _get_distributed_payload_key(
175+
self, project_and_trace: str, span_id: str
176+
) -> DistributedPayloadKey:
177+
return f"span-buf:s:{{{project_and_trace}:{span_id}}}:{span_id}".encode("ascii")
178+
179+
def _get_payload_key_index(self, segment_key: SegmentKey) -> bytes:
180+
project_id, trace_id, span_id = parse_segment_key(segment_key)
181+
return b"span-buf:mk:{%s:%s}:%s" % (project_id, trace_id, span_id)
182+
183+
def _cleanup_distributed_keys(self, segment_keys: set[SegmentKey]) -> None:
184+
"""Delete member-keys tracking sets and distributed payload keys for the
185+
given segments, and remove them from the payload keys map so
186+
done_flush_segments doesn't try again."""
187+
with self.client.pipeline(transaction=False) as p:
188+
for key in segment_keys:
189+
payload_keys = self._distributed_payload_keys_map.get(key, [])
190+
if payload_keys:
191+
mk_key = self._get_payload_key_index(key)
192+
p.delete(mk_key)
193+
for batch in itertools.batched(payload_keys, 100):
194+
p.unlink(*batch)
195+
self._distributed_payload_keys_map.pop(key, None)
196+
p.execute()
197+
167198
@metrics.wraps("spans.buffer.process_spans")
168199
def process_spans(self, spans: Sequence[Span], now: int):
169200
"""
@@ -186,6 +217,8 @@ def process_spans(self, spans: Sequence[Span], now: int):
186217
max_segment_bytes = options.get("spans.buffer.max-segment-bytes")
187218
max_spans_per_evalsha = options.get("spans.buffer.max-spans-per-evalsha")
188219
zero_copy_threshold = options.get("spans.buffer.zero-copy-dest-threshold-bytes")
220+
write_distributed_payloads = options.get("spans.buffer.write-distributed-payloads")
221+
write_merged_payloads = options.get("spans.buffer.write-merged-payloads")
189222

190223
result_meta = []
191224
is_root_span_count = 0
@@ -214,8 +247,17 @@ def process_spans(self, spans: Sequence[Span], now: int):
214247
with self.client.pipeline(transaction=False) as p:
215248
for (project_and_trace, parent_span_id), subsegment in batch:
216249
set_members = self._prepare_payloads(subsegment)
217-
set_key = self._get_span_key(project_and_trace, parent_span_id)
218-
p.sadd(set_key, *set_members.keys())
250+
if write_distributed_payloads:
251+
# Write to distributed key.
252+
dist_key = self._get_distributed_payload_key(
253+
project_and_trace, parent_span_id
254+
)
255+
p.sadd(dist_key, *set_members.keys())
256+
p.expire(dist_key, redis_ttl)
257+
258+
if write_merged_payloads:
259+
set_key = self._get_span_key(project_and_trace, parent_span_id)
260+
p.sadd(set_key, *set_members.keys())
219261

220262
p.execute()
221263

@@ -257,6 +299,8 @@ def process_spans(self, spans: Sequence[Span], now: int):
257299
max_segment_bytes,
258300
byte_count,
259301
zero_copy_threshold,
302+
"true" if write_distributed_payloads else "false",
303+
"true" if write_merged_payloads else "false",
260304
*span_ids,
261305
)
262306

@@ -565,6 +609,7 @@ def flush_segments(self, now: int) -> dict[SegmentKey, FlushedSegment]:
565609
queue_key=queue_key,
566610
spans=output_spans,
567611
project_id=int(project_id.decode("ascii")),
612+
distributed_payload_keys=self._distributed_payload_keys_map.get(segment_key, []),
568613
)
569614
num_has_root_spans += int(has_root_span)
570615

@@ -659,13 +704,50 @@ def _load_segment_data(self, segment_keys: list[SegmentKey]) -> dict[SegmentKey,
659704

660705
page_size = options.get("spans.buffer.segment-page-size")
661706
max_segment_bytes = options.get("spans.buffer.max-segment-bytes")
707+
read_distributed_payloads = options.get("spans.buffer.read-distributed-payloads")
708+
write_distributed_payloads = options.get("spans.buffer.write-distributed-payloads")
662709

663710
payloads: dict[SegmentKey, list[bytes]] = {key: [] for key in segment_keys}
664-
cursors = {key: 0 for key in segment_keys}
665711
sizes: dict[SegmentKey, int] = {key: 0 for key in segment_keys}
666712
self._last_decompress_latency_ms = 0
667713
decompress_latency_ms = 0.0
668714

715+
# Maps each scan key back to the segment it belongs to. For merged
716+
# keys these are the same; for distributed keys many map to one segment.
717+
scan_key_to_segment: dict[SegmentKey | DistributedPayloadKey, SegmentKey] = {}
718+
719+
# When read_distributed_payloads is off, scan merged segment keys directly.
720+
# When on, skip them — all data lives in distributed keys.
721+
cursors: dict[bytes, int] = {}
722+
if not read_distributed_payloads:
723+
for key in segment_keys:
724+
scan_key_to_segment[key] = key
725+
cursors[key] = 0
726+
727+
self._distributed_payload_keys_map = {}
728+
729+
if write_distributed_payloads:
730+
with self.client.pipeline(transaction=False) as p:
731+
for key in segment_keys:
732+
p.smembers(self._get_payload_key_index(key))
733+
mk_results = p.execute()
734+
735+
for key, sub_span_ids in zip(segment_keys, mk_results):
736+
project_id, trace_id, _ = parse_segment_key(key)
737+
pat = f"{project_id.decode('ascii')}:{trace_id.decode('ascii')}"
738+
distributed_keys: list[bytes] = []
739+
for sub_span_id in sub_span_ids:
740+
distributed_key = self._get_distributed_payload_key(
741+
pat, sub_span_id.decode("ascii")
742+
)
743+
distributed_keys.append(distributed_key)
744+
if read_distributed_payloads:
745+
scan_key_to_segment[distributed_key] = key
746+
cursors[distributed_key] = 0
747+
self._distributed_payload_keys_map[key] = distributed_keys
748+
749+
dropped_segments: set[SegmentKey] = set()
750+
669751
def _add_spans(key: SegmentKey, raw_data: bytes) -> bool:
670752
"""
671753
Decompress and add spans to the segment. Returns False if the
@@ -683,6 +765,7 @@ def _add_spans(key: SegmentKey, raw_data: bytes) -> bool:
683765
logger.warning("Skipping too large segment, byte size %s", sizes[key])
684766
payloads.pop(key, None)
685767
sizes.pop(key, None)
768+
dropped_segments.add(key)
686769
return False
687770

688771
payloads[key].extend(decompressed)
@@ -698,10 +781,15 @@ def _add_spans(key: SegmentKey, raw_data: bytes) -> bool:
698781
scan_results = p.execute()
699782

700783
for key, (cursor, scan_values) in zip(current_keys, scan_results):
784+
segment_key = scan_key_to_segment[key]
785+
if segment_key in dropped_segments:
786+
cursors.pop(key, None)
787+
continue
788+
701789
size_exceeded = False
702790
for scan_value in scan_values:
703-
if key in payloads:
704-
if not _add_spans(key, scan_value):
791+
if segment_key in payloads:
792+
if not _add_spans(segment_key, scan_value):
705793
size_exceeded = True
706794

707795
if size_exceeded:
@@ -711,6 +799,9 @@ def _add_spans(key: SegmentKey, raw_data: bytes) -> bool:
711799
else:
712800
cursors[key] = cursor
713801

802+
if dropped_segments:
803+
self._cleanup_distributed_keys(dropped_segments)
804+
714805
# Fetch ingested counts for all segments to calculate dropped spans
715806
with self.client.pipeline(transaction=False) as p:
716807
for key in segment_keys:
@@ -743,18 +834,18 @@ def _add_spans(key: SegmentKey, raw_data: bytes) -> bool:
743834
continue
744835

745836
project_id_bytes, _, _ = parse_segment_key(key)
746-
project_id = int(project_id_bytes)
837+
project_id_int = int(project_id_bytes)
747838
try:
748-
project = Project.objects.get_from_cache(id=project_id)
839+
project = Project.objects.get_from_cache(id=project_id_int)
749840
except Project.DoesNotExist:
750841
logger.warning(
751842
"Project does not exist for segment with dropped spans",
752-
extra={"project_id": project_id},
843+
extra={"project_id": project_id_int},
753844
)
754845
else:
755846
track_outcome(
756847
org_id=project.organization_id,
757-
project_id=project_id,
848+
project_id=project_id_int,
758849
key_id=None,
759850
outcome=Outcome.INVALID,
760851
reason="segment_too_large",
@@ -800,6 +891,14 @@ def done_flush_segments(self, segment_keys: dict[SegmentKey, FlushedSegment]):
800891
span_ids = [output_span.payload["span_id"] for output_span in span_batch]
801892
p.hdel(redirect_map_key, *span_ids)
802893

894+
if flushed_segment.distributed_payload_keys:
895+
mk_key = self._get_payload_key_index(segment_key)
896+
p.delete(mk_key)
897+
for distributed_key_batch in itertools.batched(
898+
flushed_segment.distributed_payload_keys, 100
899+
):
900+
p.unlink(*distributed_key_batch)
901+
803902
for queue_key, keys in queue_removals.items():
804903
for key_batch in itertools.batched(keys, 100):
805904
p.zrem(queue_key, *key_batch)

0 commit comments

Comments
 (0)