Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,19 @@ def wait_for_node_update_failure(rce, cluster, node_type, node_id, after_utc=Non
)


@retry(wait_fixed=seconds(10), stop_max_delay=minutes(30))
def wait_for_head_node_update_recipe_complete(rce, after_utc=None):
match, matched_content = match_regex_in_log(
rce,
"/var/log/chef-client.log",
r"Cinc Client Run complete",
after_utc=after_utc,
)
if not match:
raise Exception("Head node chef-client run has not completed yet.")
logger.info(f"Head node chef-client run completed: {matched_content}")


@retry(wait_fixed=seconds(10), stop_max_delay=minutes(30))
def wait_for_stack_rollback_complete(cluster):
client = boto3.client("cloudformation", region_name=cluster.region)
Expand Down Expand Up @@ -375,32 +388,44 @@ def assert_update_recipe_executed_on_old_nodes(snapshot, update_submitted_at, ex
).is_not_empty()


def is_update_failure_only_due_to_known_race_condition(rce, instance_ids, after_utc):
"""Check if the last readiness check failure was caused only by the given instance IDs.

Parses the last "wrong records (1)" line from /var/log/chef-client.log after the given timestamp
and extracts the instance IDs listed there.
def is_update_failure_only_due_to_known_race_condition(rce, instance_ids: list, after_utc: str):
"""Check if the last readiness check failure was caused only by the given instance IDs by inspecting the
errors returned by the cluster readiness check in /var/log/chef-client.log
Returns True if all wrong-record instance IDs are in the provided set.
Returns False if there are no wrong records or if other instance IDs are present.
"""
found, output = match_regex_in_log(
found, lines = match_regex_in_log(
rce,
"/var/log/chef-client.log",
r"wrong records \(1\):.*",
r"Retrying execution of execute\[Check cluster readiness\], 0 attempt left",
after_utc=after_utc,
nlines_after_match=50,
)
if not found:
logger.info(
"No 'wrong records (1)' line found in chef-client.log after the update. "
"No final readiness check retry found in chef-client.log after the update. "
"The cluster update failure, if observed, is not caused by the known race condition."
)
return False
instande_ids_with_wrong_record = set(re.findall(r"'(i-[0-9a-f]+)'", output))

logger.info(f"Found last readiness check retry: {lines}")

wrong_records_match = re.search(r"\* wrong records \(1\):.*", lines)
if not wrong_records_match:
logger.info(
"No '* wrong records (1):' line found in the last readiness check retry output. "
"The cluster update failure, if observed, is not caused by the known race condition."
)
return False

wrong_records_line = wrong_records_match.group()
instance_ids_with_wrong_record = set(re.findall(r"'(i-[0-9a-f]+)'", wrong_records_line))
logger.info(
f"Readiness check wrong records: {output}. "
f"Instance IDs with wrong records: {instande_ids_with_wrong_record}"
f"Readiness check wrong records: {wrong_records_line}. "
f"Instance IDs with wrong records: {instance_ids_with_wrong_record}. "
f"Instance IDs with expected failure: {instance_ids}"
)
return instande_ids_with_wrong_record == set(instance_ids)
return instance_ids_with_wrong_record == set(instance_ids)


# CN_1: a dynamic compute node that completes the bootstrap after the readiness check
Expand Down Expand Up @@ -626,14 +651,22 @@ def test_update_race_conditions(

logger.info("Verifications started")
logger.info(f"Cluster stack status is: {actual_cluster_stack_status}")

# Known Issue: when the cluster stack reaches the state UPDATE_ROLLBACK_COMPLETE,
# the head node may not have completed the update recipe yet.
# So, before collecting the cluster snapshot we must wait for it to prevent false positive test failures.
if actual_cluster_stack_status == "UPDATE_ROLLBACK_COMPLETE":
logger.info("Waiting for head node to complete the update recipe before collecting the cluster snapshot")
wait_for_head_node_update_recipe_complete(rce, after_utc=update_2_submit_time)

logger.info("Collecting cluster nodes snapshot")
cluster_snapshot = get_cluster_nodes_snapshot(cluster)
logger.info(f"Cluster snapshot:\n{json.dumps(cluster_snapshot, indent=2)}")

with soft_assertions():
# TODO: Once [RaceCondition 5] will be fixed, we will require cluster stack to always be in UPDATE_COMPLETE
if actual_cluster_stack_status != "UPDATE_COMPLETE":
if is_update_failure_only_due_to_known_race_condition(rce, {login_node_1_id}, update_2_submit_time):
if is_update_failure_only_due_to_known_race_condition(rce, [login_node_1_id], update_2_submit_time):
logger.warning(
f"Cluster stack status is {actual_cluster_stack_status} but the readiness check failed "
f"only because of {login_node_1_id} (known RaceCondition 5, not blocking the test)"
Expand Down
35 changes: 28 additions & 7 deletions tests/integration-tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1141,12 +1141,28 @@ def get_file_mtime_age_seconds(remote_command_executor, file_path):
return current_time - mtime


# Matches ISO-like timestamps at the start of a log line, with or without brackets.
# Examples: [2026-03-19T02:44:01+00:00], 2026-03-19 02:44:01, 2026-03-19T02:44:01.000Z
_LOG_TIMESTAMP_RE = re.compile(r"^\[?(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2})")


def get_log_after_timestamp(log_content: str, after_utc: str) -> str:
after_utc_prefix = after_utc[:19].replace(" ", "T")
lines = log_content.splitlines()
for i, line in enumerate(lines):
m = _LOG_TIMESTAMP_RE.match(line)
if m and m.group(1).replace(" ", "T") >= after_utc_prefix:
return "\n".join(lines[i:])
return ""


def match_regex_in_log(
rce,
log_file: str,
pattern: str,
nlines: int = 50,
after_utc: str = None,
nlines_after_match: int = 0,
) -> tuple[bool, str]:
"""
Search for a regex pattern in a remote log file, optionally filtering to lines after a UTC timestamp.
Expand All @@ -1158,22 +1174,27 @@ def match_regex_in_log(
nlines: Number of lines from the end of the log to return if pattern is not found. Defaults to 50.
after_utc: Optional UTC timestamp string in ISO format (e.g. "2026-03-10T15:00:00.000Z"). Only lines at or after
this timestamp are considered. If None, searches the entire log.
nlines_after_match: Number of lines to include after the matched line when the pattern is found. Defaults to 0
(only the matched line is returned).

Returns:
A tuple containing:
- bool: True if the pattern was found, False otherwise.
- str: The matched string if found, or the last `nlines` lines of the log if not found.
- str: If found, the full line containing the match plus the next `nlines_after_match` lines.
If not found, the last `nlines` lines of the log.
"""
if after_utc:
cmd = f"awk '$0 >= \"{after_utc}\"' {log_file} 2>/dev/null || echo ''"
result = rce.run_remote_command(cmd, raise_on_error=False)
else:
result = rce.run_remote_command(f"cat {log_file}")
result = rce.run_remote_command(f"cat {log_file}")
log_content = result.stdout

if after_utc:
log_content = get_log_after_timestamp(log_content, after_utc)

match = re.search(pattern, log_content)
if match:
return True, match.group()
# Find the start of the line containing the match, then take nlines_after_match additional lines
match_line_start = log_content.rfind("\n", 0, match.start()) + 1
result_lines = log_content[match_line_start:].splitlines()[: 1 + nlines_after_match]
return True, "\n".join(result_lines)
else:
last_lines = "\n".join(log_content.splitlines()[-nlines:])
return False, last_lines
Expand Down
Loading