diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp index e2a10b787335..cfcdd21a5b90 100644 --- a/src/IO/S3/URI.cpp +++ b/src/IO/S3/URI.cpp @@ -47,7 +47,7 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax) /// Case when bucket name and key represented in the path of S3 URL. /// E.g. (https://s3.region.amazonaws.com/bucket-name/key) /// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access - static const RE2 path_style_pattern("^/([^/]*)/(.*)"); + static const RE2 path_style_pattern("^/([^/]*)(?:/?(.*))"); if (allow_archive_path_syntax) std::tie(uri_str, archive_pattern) = getURIAndArchivePattern(uri_); @@ -124,7 +124,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax) { endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri; } - validateBucket(bucket, uri); if (!uri.getPath().empty()) { @@ -142,7 +141,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax) { is_virtual_hosted_style = false; endpoint = uri.getScheme() + "://" + uri.getAuthority(); - validateBucket(bucket, uri); } else { @@ -155,6 +153,9 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax) if (!uri.getPath().empty()) key = uri.getPath().substr(1); } + + validateBucket(bucket, uri); + validateKey(key, uri); } void URI::addRegionToURI(const std::string ®ion) @@ -175,6 +176,37 @@ void URI::validateBucket(const String & bucket, const Poco::URI & uri) !uri.empty() ? " (" + uri.toString() + ")" : ""); } +void URI::validateKey(const String & key, const Poco::URI & uri) +{ + auto onError = [&]() + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Invalid S3 key: {}{}", + quoteString(key), + !uri.empty() ? " (" + uri.toString() + ")" : ""); + }; + + + // this shouldn't happen ever because the regex should not catch this + if (key.size() == 1 && key[0] == '/') + { + onError(); + } + + // the current regex impl allows something like "bucket-name/////". + // bucket: bucket-name + // key: //// + // throw exception in case such thing is found + for (size_t i = 1; i < key.size(); i++) + { + if (key[i - 1] == '/' && key[i] == '/') + { + onError(); + } + } +} + std::pair> URI::getURIAndArchivePattern(const std::string & source) { size_t pos = source.find("::"); diff --git a/src/IO/S3/URI.h b/src/IO/S3/URI.h index c8d0b28cd150..d455cd1908f0 100644 --- a/src/IO/S3/URI.h +++ b/src/IO/S3/URI.h @@ -40,6 +40,7 @@ struct URI void addRegionToURI(const std::string & region); static void validateBucket(const std::string & bucket, const Poco::URI & uri); + static void validateKey(const std::string & key, const Poco::URI & uri); private: std::pair> getURIAndArchivePattern(const std::string & source); diff --git a/src/IO/S3/tests/gtest_s3_uri.cpp b/src/IO/S3/tests/gtest_s3_uri.cpp new file mode 100644 index 000000000000..a429c645fca3 --- /dev/null +++ b/src/IO/S3/tests/gtest_s3_uri.cpp @@ -0,0 +1,41 @@ +#include + +#include +#include "config.h" + + +#if USE_AWS_S3 + +TEST(IOTestS3URI, PathStyleNoKey) +{ + using namespace DB; + + auto uri_with_no_key_and_no_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name"); + + ASSERT_EQ(uri_with_no_key_and_no_slash.bucket, "bucket-name"); + ASSERT_EQ(uri_with_no_key_and_no_slash.key, ""); + + auto uri_with_no_key_and_with_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/"); + + ASSERT_EQ(uri_with_no_key_and_with_slash.bucket, "bucket-name"); + ASSERT_EQ(uri_with_no_key_and_with_slash.key, ""); + + ASSERT_ANY_THROW(S3::URI("https://s3.region.amazonaws.com/bucket-name//")); +} + +TEST(IOTestS3URI, PathStyleWithKey) +{ + using namespace DB; + + auto uri_with_no_key_and_no_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/key"); + + ASSERT_EQ(uri_with_no_key_and_no_slash.bucket, "bucket-name"); + ASSERT_EQ(uri_with_no_key_and_no_slash.key, "key"); + + auto uri_with_no_key_and_with_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/key/key/key/key"); + + ASSERT_EQ(uri_with_no_key_and_with_slash.bucket, "bucket-name"); + ASSERT_EQ(uri_with_no_key_and_with_slash.key, "key/key/key/key"); +} + +#endif