diff --git a/tests/integration-tests/tests/update/test_update_race_conditions.py b/tests/integration-tests/tests/update/test_update_race_conditions.py index 640970cc58..5a0744b2f3 100644 --- a/tests/integration-tests/tests/update/test_update_race_conditions.py +++ b/tests/integration-tests/tests/update/test_update_race_conditions.py @@ -169,6 +169,19 @@ def wait_for_node_update_failure(rce, cluster, node_type, node_id, after_utc=Non ) +@retry(wait_fixed=seconds(10), stop_max_delay=minutes(30)) +def wait_for_head_node_update_recipe_complete(rce, after_utc=None): + match, matched_content = match_regex_in_log( + rce, + "/var/log/chef-client.log", + r"Cinc Client Run complete", + after_utc=after_utc, + ) + if not match: + raise Exception("Head node chef-client run has not completed yet.") + logger.info(f"Head node chef-client run completed: {matched_content}") + + @retry(wait_fixed=seconds(10), stop_max_delay=minutes(30)) def wait_for_stack_rollback_complete(cluster): client = boto3.client("cloudformation", region_name=cluster.region) @@ -375,32 +388,44 @@ def assert_update_recipe_executed_on_old_nodes(snapshot, update_submitted_at, ex ).is_not_empty() -def is_update_failure_only_due_to_known_race_condition(rce, instance_ids, after_utc): - """Check if the last readiness check failure was caused only by the given instance IDs. - - Parses the last "wrong records (1)" line from /var/log/chef-client.log after the given timestamp - and extracts the instance IDs listed there. +def is_update_failure_only_due_to_known_race_condition(rce, instance_ids: list, after_utc: str): + """Check if the last readiness check failure was caused only by the given instance IDs by inspecting the + errors returned by the cluster readiness check in /var/log/chef-client.log Returns True if all wrong-record instance IDs are in the provided set. Returns False if there are no wrong records or if other instance IDs are present. """ - found, output = match_regex_in_log( + found, lines = match_regex_in_log( rce, "/var/log/chef-client.log", - r"wrong records \(1\):.*", + r"Retrying execution of execute\[Check cluster readiness\], 0 attempt left", after_utc=after_utc, + nlines_after_match=50, ) if not found: logger.info( - "No 'wrong records (1)' line found in chef-client.log after the update. " + "No final readiness check retry found in chef-client.log after the update. " "The cluster update failure, if observed, is not caused by the known race condition." ) return False - instande_ids_with_wrong_record = set(re.findall(r"'(i-[0-9a-f]+)'", output)) + + logger.info(f"Found last readiness check retry: {lines}") + + wrong_records_match = re.search(r"\* wrong records \(1\):.*", lines) + if not wrong_records_match: + logger.info( + "No '* wrong records (1):' line found in the last readiness check retry output. " + "The cluster update failure, if observed, is not caused by the known race condition." + ) + return False + + wrong_records_line = wrong_records_match.group() + instance_ids_with_wrong_record = set(re.findall(r"'(i-[0-9a-f]+)'", wrong_records_line)) logger.info( - f"Readiness check wrong records: {output}. " - f"Instance IDs with wrong records: {instande_ids_with_wrong_record}" + f"Readiness check wrong records: {wrong_records_line}. " + f"Instance IDs with wrong records: {instance_ids_with_wrong_record}. " + f"Instance IDs with expected failure: {instance_ids}" ) - return instande_ids_with_wrong_record == set(instance_ids) + return instance_ids_with_wrong_record == set(instance_ids) # CN_1: a dynamic compute node that completes the bootstrap after the readiness check @@ -626,6 +651,14 @@ def test_update_race_conditions( logger.info("Verifications started") logger.info(f"Cluster stack status is: {actual_cluster_stack_status}") + + # Known Issue: when the cluster stack reaches the state UPDATE_ROLLBACK_COMPLETE, + # the head node may not have completed the update recipe yet. + # So, before collecting the cluster snapshot we must wait for it to prevent false positive test failures. + if actual_cluster_stack_status == "UPDATE_ROLLBACK_COMPLETE": + logger.info("Waiting for head node to complete the update recipe before collecting the cluster snapshot") + wait_for_head_node_update_recipe_complete(rce, after_utc=update_2_submit_time) + logger.info("Collecting cluster nodes snapshot") cluster_snapshot = get_cluster_nodes_snapshot(cluster) logger.info(f"Cluster snapshot:\n{json.dumps(cluster_snapshot, indent=2)}") @@ -633,7 +666,7 @@ def test_update_race_conditions( with soft_assertions(): # TODO: Once [RaceCondition 5] will be fixed, we will require cluster stack to always be in UPDATE_COMPLETE if actual_cluster_stack_status != "UPDATE_COMPLETE": - if is_update_failure_only_due_to_known_race_condition(rce, {login_node_1_id}, update_2_submit_time): + if is_update_failure_only_due_to_known_race_condition(rce, [login_node_1_id], update_2_submit_time): logger.warning( f"Cluster stack status is {actual_cluster_stack_status} but the readiness check failed " f"only because of {login_node_1_id} (known RaceCondition 5, not blocking the test)" diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index 6c45f8e37a..6b5eee8bde 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -1141,12 +1141,28 @@ def get_file_mtime_age_seconds(remote_command_executor, file_path): return current_time - mtime +# Matches ISO-like timestamps at the start of a log line, with or without brackets. +# Examples: [2026-03-19T02:44:01+00:00], 2026-03-19 02:44:01, 2026-03-19T02:44:01.000Z +_LOG_TIMESTAMP_RE = re.compile(r"^\[?(\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2})") + + +def get_log_after_timestamp(log_content: str, after_utc: str) -> str: + after_utc_prefix = after_utc[:19].replace(" ", "T") + lines = log_content.splitlines() + for i, line in enumerate(lines): + m = _LOG_TIMESTAMP_RE.match(line) + if m and m.group(1).replace(" ", "T") >= after_utc_prefix: + return "\n".join(lines[i:]) + return "" + + def match_regex_in_log( rce, log_file: str, pattern: str, nlines: int = 50, after_utc: str = None, + nlines_after_match: int = 0, ) -> tuple[bool, str]: """ Search for a regex pattern in a remote log file, optionally filtering to lines after a UTC timestamp. @@ -1158,22 +1174,27 @@ def match_regex_in_log( nlines: Number of lines from the end of the log to return if pattern is not found. Defaults to 50. after_utc: Optional UTC timestamp string in ISO format (e.g. "2026-03-10T15:00:00.000Z"). Only lines at or after this timestamp are considered. If None, searches the entire log. + nlines_after_match: Number of lines to include after the matched line when the pattern is found. Defaults to 0 + (only the matched line is returned). Returns: A tuple containing: - bool: True if the pattern was found, False otherwise. - - str: The matched string if found, or the last `nlines` lines of the log if not found. + - str: If found, the full line containing the match plus the next `nlines_after_match` lines. + If not found, the last `nlines` lines of the log. """ - if after_utc: - cmd = f"awk '$0 >= \"{after_utc}\"' {log_file} 2>/dev/null || echo ''" - result = rce.run_remote_command(cmd, raise_on_error=False) - else: - result = rce.run_remote_command(f"cat {log_file}") + result = rce.run_remote_command(f"cat {log_file}") log_content = result.stdout + if after_utc: + log_content = get_log_after_timestamp(log_content, after_utc) + match = re.search(pattern, log_content) if match: - return True, match.group() + # Find the start of the line containing the match, then take nlines_after_match additional lines + match_line_start = log_content.rfind("\n", 0, match.start()) + 1 + result_lines = log_content[match_line_start:].splitlines()[: 1 + nlines_after_match] + return True, "\n".join(result_lines) else: last_lines = "\n".join(log_content.splitlines()[-nlines:]) return False, last_lines