Skip to content

Commit a6bc1fe

Browse files
[4/n] IPv6 support: Add IPv6 support for sockets (#56147)
Signed-off-by: Yicheng-Lu-llll <luyc58576@gmail.com> Signed-off-by: Yicheng-Lu-llll <51814063+Yicheng-Lu-llll@users.noreply.github.com> Signed-off-by: Jiajun Yao <jeromeyjj@gmail.com> Co-authored-by: Jiajun Yao <jeromeyjj@gmail.com>
1 parent 444646e commit a6bc1fe

File tree

29 files changed

+369
-148
lines changed

29 files changed

+369
-148
lines changed

cpp/src/ray/runtime/native_ray_runtime.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "./object/object_store.h"
2121
#include "./task/native_task_submitter.h"
2222
#include "ray/common/ray_config.h"
23+
#include "ray/util/network_util.h"
2324

2425
namespace ray {
2526
namespace internal {
@@ -31,7 +32,7 @@ NativeRayRuntime::NativeRayRuntime() {
3132

3233
auto bootstrap_address = ConfigInternal::Instance().bootstrap_ip;
3334
if (bootstrap_address.empty()) {
34-
bootstrap_address = GetNodeIpAddress();
35+
bootstrap_address = ray::GetNodeIpAddressFromPerspective();
3536
}
3637
global_state_accessor_ = ProcessHelper::GetInstance().CreateGlobalStateAccessor(
3738
bootstrap_address, ConfigInternal::Instance().bootstrap_port);

cpp/src/ray/test/cluster/cluster_mode_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ TEST(RayClusterModeTest, FullTest) {
7171
auto port = absl::GetFlag<int32_t>(FLAGS_redis_port);
7272
std::string username = absl::GetFlag<std::string>(FLAGS_redis_username);
7373
std::string password = absl::GetFlag<std::string>(FLAGS_redis_password);
74-
std::string local_ip = ray::internal::GetNodeIpAddress();
74+
std::string local_ip = ray::GetNodeIpAddressFromPerspective();
7575
ray::internal::ProcessHelper::GetInstance().StartRayNode(
7676
local_ip, port, username, password);
7777
config.address = ray::BuildAddress(local_ip, port);

cpp/src/ray/util/process_helper.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ void ProcessHelper::RayStart(CoreWorkerOptions::TaskExecutionCallback callback)
8383

8484
if (ConfigInternal::Instance().worker_type == WorkerType::DRIVER &&
8585
bootstrap_ip.empty()) {
86-
bootstrap_ip = GetNodeIpAddress();
86+
bootstrap_ip = ray::GetNodeIpAddressFromPerspective();
8787
StartRayNode(bootstrap_ip,
8888
bootstrap_port,
8989
ConfigInternal::Instance().redis_username,
@@ -95,9 +95,9 @@ void ProcessHelper::RayStart(CoreWorkerOptions::TaskExecutionCallback callback)
9595
std::string node_ip = ConfigInternal::Instance().node_ip_address;
9696
if (node_ip.empty()) {
9797
if (!bootstrap_ip.empty()) {
98-
node_ip = GetNodeIpAddress(bootstrap_address);
98+
node_ip = ray::GetNodeIpAddressFromPerspective(bootstrap_address);
9999
} else {
100-
node_ip = GetNodeIpAddress();
100+
node_ip = ray::GetNodeIpAddressFromPerspective();
101101
}
102102
}
103103

cpp/src/ray/util/util.cc

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,6 @@
2424
namespace ray {
2525
namespace internal {
2626

27-
std::string GetNodeIpAddress(const std::string &address) {
28-
auto parts = ParseAddress(address);
29-
RAY_CHECK(parts.has_value());
30-
try {
31-
boost::asio::io_service netService;
32-
boost::asio::ip::udp::resolver resolver(netService);
33-
boost::asio::ip::udp::resolver::query query(
34-
boost::asio::ip::udp::v4(), (*parts)[0], (*parts)[1]);
35-
boost::asio::ip::udp::resolver::iterator endpoints = resolver.resolve(query);
36-
boost::asio::ip::udp::endpoint ep = *endpoints;
37-
boost::asio::ip::udp::socket socket(netService);
38-
socket.connect(ep);
39-
boost::asio::ip::address addr = socket.local_endpoint().address();
40-
return addr.to_string();
41-
} catch (std::exception &e) {
42-
RAY_LOG(FATAL) << "Could not get the node IP address with socket. Exception: "
43-
<< e.what();
44-
return "";
45-
}
46-
}
47-
4827
std::string getLibraryPathEnv() {
4928
auto path_env_p = std::getenv(kLibraryPathEnvName);
5029
if (path_env_p != nullptr && strlen(path_env_p) != 0) {

cpp/src/ray/util/util.h

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,6 @@
1818
namespace ray {
1919
namespace internal {
2020

21-
/// IP address by which the local node can be reached *from* the `address`.
22-
///
23-
/// The behavior should be the same as `node_ip_address_from_perspective` from Ray Python
24-
/// code. See
25-
/// https://stackoverflow.com/questions/2674314/get-local-ip-address-using-boost-asio.
26-
///
27-
/// TODO(kfstorm): Make this function shared code and migrate Python & Java to use this
28-
/// function.
29-
///
30-
/// \param address The IP address and port of any known live service on the network
31-
/// you care about.
32-
/// \return The IP address by which the local node can be reached from the address.
33-
std::string GetNodeIpAddress(const std::string &address = "8.8.8.8:53");
34-
3521
std::string getLibraryPathEnv();
3622

3723
} // namespace internal

doc/source/ray-core/examples/lm/ray_train.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from fairseq_cli.train import main
1212

1313
import ray
14-
from ray._common.network_utils import build_address
1514

1615
_original_save_checkpoint = fairseq.checkpoint_utils.save_checkpoint
1716

@@ -113,7 +112,7 @@ def run_fault_tolerant_loop():
113112
# fairseq distributed training.
114113
ip = ray.get(workers[0].get_node_ip.remote())
115114
port = ray.get(workers[0].find_free_port.remote())
116-
address = f"tcp://{build_address(ip, port)}"
115+
address = f"tcp://{ip}:{port}"
117116

118117
# Start the remote processes, and check whether there are any process
119118
# failures. If so, restart all the processes.

python/ray/_common/network_utils.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1+
import socket
2+
from functools import lru_cache
13
from typing import Optional, Tuple, Union
24

3-
from ray._raylet import build_address as _build_address, parse_address as _parse_address
5+
from ray._raylet import (
6+
build_address as _build_address,
7+
is_ipv6 as _is_ipv6,
8+
node_ip_address_from_perspective as _node_ip_address_from_perspective,
9+
parse_address as _parse_address,
10+
)
411

512

613
def parse_address(address: str) -> Optional[Tuple[str, str]]:
@@ -28,6 +35,54 @@ def build_address(host: str, port: Union[int, str]) -> str:
2835
return _build_address(host, port)
2936

3037

38+
def node_ip_address_from_perspective(address: Optional[str] = None) -> str:
39+
"""IP address by which the local node can be reached *from* the `address`.
40+
41+
If no address is given, defaults to public DNS servers for detection.
42+
43+
Args:
44+
address: The IP address and port of any known live service on the
45+
network you care about.
46+
47+
Returns:
48+
The IP address by which the local node can be reached from the address.
49+
"""
50+
return _node_ip_address_from_perspective(address)
51+
52+
53+
def is_ipv6(host: str) -> bool:
54+
"""Check if a host is resolved to IPv6.
55+
56+
Args:
57+
host: The IP or domain name to check (must be without port).
58+
59+
Returns:
60+
True if the host is resolved to IPv6, False if IPv4.
61+
"""
62+
return _is_ipv6(host)
63+
64+
65+
@lru_cache(maxsize=1)
66+
def get_localhost_ip() -> str:
67+
"""Get localhost loopback ip with IPv4/IPv6 support.
68+
69+
Returns:
70+
The localhost loopback IP.
71+
"""
72+
# Try IPv4 first, then IPv6 localhost resolution
73+
for family in [socket.AF_INET, socket.AF_INET6]:
74+
try:
75+
dns_result = socket.getaddrinfo(
76+
"localhost", None, family, socket.SOCK_STREAM
77+
)
78+
return dns_result[0][4][0]
79+
except Exception:
80+
continue
81+
82+
# Final fallback to IPv4 loopback
83+
return "127.0.0.1"
84+
85+
3186
def is_localhost(host: str) -> bool:
3287
"""Check if the given host string represents a localhost address.
3388

python/ray/_private/node.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,12 @@
2222
import ray
2323
import ray._private.ray_constants as ray_constants
2424
import ray._private.services
25-
from ray._common.network_utils import build_address, parse_address
25+
from ray._common.network_utils import (
26+
build_address,
27+
get_localhost_ip,
28+
is_ipv6,
29+
parse_address,
30+
)
2631
from ray._common.ray_constants import LOGGING_ROTATE_BACKUP_COUNT, LOGGING_ROTATE_BYTES
2732
from ray._common.utils import try_to_create_directory
2833
from ray._private.resource_and_label_spec import ResourceAndLabelSpec
@@ -140,7 +145,7 @@ def __init__(
140145
)
141146

142147
self._resource_and_label_spec = None
143-
self._localhost = socket.gethostbyname("localhost")
148+
self._localhost = get_localhost_ip()
144149
self._ray_params = ray_params
145150
self._config = ray_params._system_config or {}
146151

@@ -882,7 +887,10 @@ def _get_unused_port(self, allocated_ports=None):
882887
if allocated_ports is None:
883888
allocated_ports = set()
884889

885-
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
890+
s = socket.socket(
891+
socket.AF_INET6 if is_ipv6(self._node_ip_address) else socket.AF_INET,
892+
socket.SOCK_STREAM,
893+
)
886894
s.bind(("", 0))
887895
port = s.getsockname()[1]
888896

@@ -895,7 +903,10 @@ def _get_unused_port(self, allocated_ports=None):
895903
# This port is allocated for other usage already,
896904
# so we shouldn't use it even if it's not in use right now.
897905
continue
898-
new_s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
906+
new_s = socket.socket(
907+
socket.AF_INET6 if is_ipv6(self._node_ip_address) else socket.AF_INET,
908+
socket.SOCK_STREAM,
909+
)
899910
try:
900911
new_s.bind(("", new_port))
901912
except OSError:

python/ray/_private/services.py

Lines changed: 15 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,13 @@
2121
# Ray modules
2222
import ray
2323
import ray._private.ray_constants as ray_constants
24-
from ray._common.network_utils import build_address, parse_address
24+
from ray._common.network_utils import (
25+
build_address,
26+
get_localhost_ip,
27+
is_ipv6,
28+
node_ip_address_from_perspective,
29+
parse_address,
30+
)
2531
from ray._private.ray_constants import RAY_NODE_IP_FILENAME
2632
from ray._private.resource_isolation_config import ResourceIsolationConfig
2733
from ray._raylet import GcsClient, GcsClientOptions
@@ -618,52 +624,21 @@ def resolve_ip_for_localhost(host: str):
618624
return host
619625

620626

621-
def node_ip_address_from_perspective(address: str):
622-
"""IP address by which the local node can be reached *from* the `address`.
623-
624-
Args:
625-
address: The IP address and port of any known live service on the
626-
network you care about.
627-
628-
Returns:
629-
The IP address by which the local node can be reached from the address.
630-
"""
631-
ip_address, port = parse_address(address)
632-
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
633-
try:
634-
# This command will raise an exception if there is no internet
635-
# connection.
636-
s.connect((ip_address, int(port)))
637-
node_ip_address = s.getsockname()[0]
638-
except OSError as e:
639-
node_ip_address = "127.0.0.1"
640-
# [Errno 101] Network is unreachable
641-
if e.errno == errno.ENETUNREACH:
642-
try:
643-
# try get node ip address from host name
644-
host_name = socket.getfqdn(socket.gethostname())
645-
node_ip_address = socket.gethostbyname(host_name)
646-
except Exception:
647-
pass
648-
finally:
649-
s.close()
650-
651-
return node_ip_address
652-
653-
654627
# NOTE: Do not use this API to obtain the
655628
# IP address before ray.init has been called, because
656629
# it cannot find the IP address if it is specified by
657630
# ray start --node-ip-address. You should instead use
658631
# get_cached_node_ip_address.
659-
def get_node_ip_address(address="8.8.8.8:53"):
632+
def get_node_ip_address(address=None):
660633
if ray._private.worker._global_node is not None:
661634
return ray._private.worker._global_node.node_ip_address
635+
662636
if not ray_constants.ENABLE_RAY_CLUSTER:
663637
# Use loopback IP as the local IP address to prevent bothersome
664638
# firewall popups on OSX and Windows.
665639
# https://github.com/ray-project/ray/issues/18730.
666-
return "127.0.0.1"
640+
return get_localhost_ip()
641+
667642
return node_ip_address_from_perspective(address)
668643

669644

@@ -1225,7 +1200,10 @@ def start_api_server(
12251200
port = ray_constants.DEFAULT_DASHBOARD_PORT
12261201
else:
12271202
port_retries = 0
1228-
port_test_socket = socket.socket()
1203+
port_test_socket = socket.socket(
1204+
socket.AF_INET6 if is_ipv6(host) else socket.AF_INET,
1205+
socket.SOCK_STREAM,
1206+
)
12291207
port_test_socket.setsockopt(
12301208
socket.SOL_SOCKET,
12311209
socket.SO_REUSEADDR,

python/ray/_private/tls_utils.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
import os
33
import socket
44

5+
from ray._common.network_utils import (
6+
get_localhost_ip,
7+
node_ip_address_from_perspective,
8+
)
9+
510

611
def generate_self_signed_tls_certs():
712
"""Create self-signed key/cert pair for testing.
@@ -29,21 +34,13 @@ def generate_self_signed_tls_certs():
2934
).decode()
3035

3136
ray_interal = x509.Name([x509.NameAttribute(NameOID.COMMON_NAME, "ray-internal")])
32-
# This is the same logic used by the GCS server to acquire a
33-
# private/interal IP address to listen on. If we just use localhost +
34-
# 127.0.0.1 then we won't be able to connect to the GCS and will get
35-
# an error like "No match found for server name: 192.168.X.Y"
36-
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
37-
s.connect(("8.8.8.8", 80))
38-
private_ip_address = s.getsockname()[0]
39-
s.close()
4037
altnames = x509.SubjectAlternativeName(
4138
[
4239
x509.DNSName(
4340
socket.gethostbyname(socket.gethostname())
44-
), # Probably 127.0.0.1
45-
x509.DNSName("127.0.0.1"),
46-
x509.DNSName(private_ip_address), # 192.168.*.*
41+
), # Probably 127.0.0.1 or ::1
42+
x509.DNSName(get_localhost_ip()),
43+
x509.DNSName(node_ip_address_from_perspective()),
4744
x509.DNSName("localhost"),
4845
]
4946
)

0 commit comments

Comments
 (0)