diff --git a/src/sentry/issues/endpoints/group_similar_issues_embeddings.py b/src/sentry/issues/endpoints/group_similar_issues_embeddings.py index 8a9c5ba301bda5..996f7628d47cc1 100644 --- a/src/sentry/issues/endpoints/group_similar_issues_embeddings.py +++ b/src/sentry/issues/endpoints/group_similar_issues_embeddings.py @@ -66,6 +66,12 @@ def get_formatted_results( } for similar_issue_data in similar_issues_data: if parent_hashes_group_ids[similar_issue_data.parent_hash] != group.id: + # Results are sorted by ascending distance, so the first occurrence of each + # group has the best (lowest distance / highest similarity) score. Skip + # duplicates to avoid overwriting a better score with a worse one while + # keeping the dict insertion position from the first (best) entry. + if similar_issue_data.parent_group_id in group_data: + continue formatted_response: FormattedSimilarIssuesEmbeddingsData = { "exception": round(1 - similar_issue_data.stacktrace_distance, 4), "shouldBeGrouped": "Yes" if similar_issue_data.should_group else "No", diff --git a/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py b/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py index 44737fe2cbc7f9..6e1b063b700100 100644 --- a/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py +++ b/tests/sentry/issues/endpoints/test_group_similar_issues_embeddings.py @@ -139,6 +139,56 @@ def test_get_formatted_results(self) -> None: ["Yes", "No"], ) + def test_get_formatted_results_deduplicates_groups(self) -> None: + event_from_second_similar_group = save_new_event({"message": "example"}, self.project) + assert self.similar_event.group_id is not None + assert event_from_second_similar_group.group_id is not None + + second_hash = GroupHash.objects.create( + project=self.project, + group_id=self.similar_event.group_id, + hash="duplicate_hash_for_same_group", + ) + + similar_issue_data_best = SeerSimilarIssueData( + parent_group_id=self.similar_event.group_id, + parent_hash=self.similar_event.get_primary_hash(), + should_group=True, + stacktrace_distance=0.01, + ) + similar_issue_data_other = SeerSimilarIssueData( + parent_group_id=event_from_second_similar_group.group_id, + parent_hash=event_from_second_similar_group.get_primary_hash(), + should_group=False, + stacktrace_distance=0.05, + ) + similar_issue_data_worse_dup = SeerSimilarIssueData( + parent_group_id=self.similar_event.group_id, + parent_hash=second_hash.hash, + should_group=True, + stacktrace_distance=0.10, + ) + + group_similar_endpoint = GroupSimilarIssuesEmbeddingsEndpoint() + formatted_results = group_similar_endpoint.get_formatted_results( + similar_issues_data=[ + similar_issue_data_best, + similar_issue_data_other, + similar_issue_data_worse_dup, + ], + user=self.user, + group=self.group, + ) + + assert formatted_results == self.get_expected_response( + [ + self.similar_event.group_id, + event_from_second_similar_group.group_id, + ], + [0.99, 0.95], + ["Yes", "No"], + ) + @mock.patch("sentry.seer.similarity.similar_issues.metrics.incr") @mock.patch("sentry.seer.similarity.similar_issues.seer_grouping_connection_pool.urlopen") @mock.patch("sentry.issues.endpoints.group_similar_issues_embeddings.logger")