Skip to content

Commit 7fdfff6

Browse files
authored
ref(span-buffer): Test span buffer against examples (#109318)
* Feed every span example into the buffer just to ensure it does not crash. * Flush the buffer. * Validate that the output span is also valid according to the schema, and that the output segment is also valid according to that other schema. This is related to INC-2053, but does not contain the actual fix. Once #109316 is merged, this PR should pass.
1 parent da919ed commit 7fdfff6

File tree

2 files changed

+81
-6
lines changed

2 files changed

+81
-6
lines changed

src/sentry/spans/buffer.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,13 @@
7070
import math
7171
import time
7272
from collections.abc import Generator, MutableMapping, Sequence
73-
from typing import Any, NamedTuple
73+
from typing import Any, NamedTuple, cast
7474

7575
import orjson
7676
import zstandard
7777
from django.conf import settings
7878
from django.utils.functional import cached_property
79+
from sentry_kafka_schemas.schema_types.ingest_spans_v1 import SpanEvent
7980
from sentry_redis_tools.clients import RedisCluster, StrictRedis
8081

8182
from sentry import options
@@ -612,12 +613,13 @@ def flush_segments(self, now: int) -> dict[SegmentKey, FlushedSegment]:
612613
# This incr metric is needed to get a rate overall.
613614
metrics.incr("spans.buffer.flush_segments.count_spans_per_segment", amount=len(segment))
614615
for payload in segment:
615-
span = orjson.loads(payload)
616+
span: SpanEvent = orjson.loads(payload)
616617

617618
if not attribute_value(span, "sentry.segment.id"):
618-
if not isinstance(span.get("attributes"), dict):
619-
span["attributes"] = {}
620-
span["attributes"]["sentry.segment.id"] = {
619+
attributes = span.get("attributes")
620+
if not isinstance(attributes, dict):
621+
span["attributes"] = attributes = {}
622+
attributes["sentry.segment.id"] = {
621623
"type": "string",
622624
"value": segment_span_id,
623625
}
@@ -627,7 +629,7 @@ def flush_segments(self, now: int) -> dict[SegmentKey, FlushedSegment]:
627629
if is_segment:
628630
has_root_span = True
629631

630-
output_spans.append(OutputSpan(payload=span))
632+
output_spans.append(OutputSpan(payload=cast(dict[str, Any], span)))
631633

632634
metrics.incr(
633635
"spans.buffer.flush_segments.num_segments_per_shard", tags={"shard_i": shard}

tests/sentry/spans/test_buffer.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,19 @@
22

33
import itertools
44
from collections.abc import Sequence
5+
from typing import cast
56
from unittest import mock
67

78
import orjson
89
import pytest
10+
import sentry_kafka_schemas
11+
from sentry_kafka_schemas.schema_types.ingest_spans_v1 import SpanEvent
912
from sentry_redis_tools.clients import StrictRedis
1013

14+
from sentry.conf.types.kafka_definition import Topic, get_topic_codec
1115
from sentry.spans.buffer import FlushedSegment, OutputSpan, Span, SpansBuffer
16+
from sentry.spans.consumers.process.factory import SPANS_CODEC, validate_span_event
17+
from sentry.spans.consumers.process_segments.types import attribute_value
1218
from sentry.spans.segment_key import SegmentKey
1319
from sentry.testutils.helpers.options import override_options
1420

@@ -32,6 +38,7 @@
3238
"spans.buffer.evalsha-latency-threshold": 100,
3339
"spans.buffer.debug-traces": [],
3440
"spans.buffer.evalsha-cumulative-logger-enabled": True,
41+
"spans.process-segments.schema-validation": 1.0,
3542
"spans.buffer.zero-copy-dest-threshold-bytes": 0,
3643
"spans.buffer.done-flush-conditional-zrem": False,
3744
"spans.buffer.write-distributed-payloads": False,
@@ -1646,3 +1653,69 @@ def test_distributed_transition_write_then_read() -> None:
16461653
assert len(rv[seg_key].spans) == 2
16471654
buf.done_flush_segments(rv)
16481655
assert_clean_distributed(buf.client)
1656+
1657+
1658+
def _get_schema_examples():
1659+
"""Load all ingest-spans schema examples for parametrization."""
1660+
examples = []
1661+
for ex in sentry_kafka_schemas.iter_examples("ingest-spans"):
1662+
examples.append(pytest.param(ex.load(), id=ex.path.stem))
1663+
return examples
1664+
1665+
1666+
@pytest.mark.parametrize("example", _get_schema_examples())
1667+
def test_schema_examples(buffer: SpansBuffer, example: dict) -> None:
1668+
"""
1669+
Feed official ingest-spans schema examples through the buffer pipeline
1670+
to verify they are handled without errors.
1671+
"""
1672+
# Replicate the parsing logic from process_batch() in factory.py
1673+
segment_id = cast(str | None, attribute_value(example, "sentry.segment.id"))
1674+
validate_span_event(cast(SpanEvent, example), segment_id)
1675+
1676+
payload = orjson.dumps(example)
1677+
1678+
span = Span(
1679+
trace_id=example["trace_id"],
1680+
span_id=example["span_id"],
1681+
parent_span_id=example.get("parent_span_id"),
1682+
segment_id=segment_id,
1683+
project_id=example["project_id"],
1684+
payload=payload,
1685+
end_timestamp=cast(float, example["end_timestamp"]),
1686+
is_segment_span=bool(example.get("parent_span_id") is None or example.get("is_segment")),
1687+
)
1688+
1689+
process_spans([span], buffer, now=0)
1690+
assert_ttls(buffer.client)
1691+
1692+
# Flush past both root-timeout (10s) and timeout (60s)
1693+
rv = buffer.flush_segments(now=61)
1694+
1695+
assert len(rv) == 1
1696+
segment = list(rv.values())[0]
1697+
assert len(segment.spans) == 1
1698+
1699+
output_span = segment.spans[0]
1700+
assert output_span.payload["span_id"] == example["span_id"]
1701+
assert output_span.payload["trace_id"] == example["trace_id"]
1702+
1703+
# Verify top-level keys are preserved (except is_segment and attributes
1704+
# which the buffer modifies)
1705+
for key in example:
1706+
if key in ("is_segment", "attributes"):
1707+
continue
1708+
assert key in output_span.payload, f"Key {key!r} missing from output payload"
1709+
1710+
# Validate that the output span still conforms to the ingest-spans schema.
1711+
# It's not explicitly written anywhere that the spans schema in
1712+
# buffered-segments is the same one as the input schema, but right now
1713+
# that's what it is.
1714+
SPANS_CODEC.validate(cast(SpanEvent, output_span.payload))
1715+
1716+
# Validate that the assembled segment conforms to the buffered-segments schema
1717+
buffered_segments_codec = get_topic_codec(Topic.BUFFERED_SEGMENTS)
1718+
buffered_segments_codec.validate({"spans": [span.payload for span in segment.spans]})
1719+
1720+
buffer.done_flush_segments(rv)
1721+
assert_clean(buffer.client)

0 commit comments

Comments (0)