
Commit cbebd3f

[FLINK-39482][filesystem] Support configurable maxConnections in S3ClientProvider
1 parent d510e4e commit cbebd3f

6 files changed

Lines changed: 261 additions & 13 deletions

flink-filesystems/flink-s3-fs-native/src/main/java/org/apache/flink/fs/s3native/NativeS3BulkCopyHelper.java

Lines changed: 91 additions & 10 deletions
@@ -19,6 +19,7 @@
 package org.apache.flink.fs.s3native;

 import org.apache.flink.annotation.Internal;
+import org.apache.flink.annotation.VisibleForTesting;
 import org.apache.flink.core.fs.ICloseableRegistry;
 import org.apache.flink.core.fs.PathsCopyingFileSystem;

@@ -37,15 +38,18 @@
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutionException;

+import static org.apache.flink.util.Preconditions.checkArgument;
+
 /**
  * Helper class for performing bulk S3 to local file system copies using S3TransferManager.
  *
  * <p><b>Concurrency Model:</b> Uses batch-based concurrency control with {@code
- * maxConcurrentCopies} to limit parallel downloads. The current implementation waits for each batch
- * to complete before starting the next batch. A future enhancement could use a bounded thread pool
- * (e.g., {@link java.util.concurrent.Semaphore} or bounded executor) to allow continuous submission
- * of new downloads as slots become available, which would provide better throughput by avoiding the
- * "slowest task in batch" bottleneck.
+ * maxConcurrentCopies} to limit parallel downloads. The effective concurrency is clamped to the
+ * HTTP connection pool size ({@code maxConnections}) to prevent connection pool exhaustion. The
+ * current implementation waits for each batch to complete before starting the next batch. A future
+ * enhancement could use a bounded thread pool (e.g., {@link java.util.concurrent.Semaphore} or
+ * bounded executor) to allow continuous submission of new downloads as slots become available,
+ * which would provide better throughput by avoiding the "slowest task in batch" bottleneck.
  *
  * <p><b>Retry Handling:</b> Relies on the S3TransferManager's built-in retry mechanism for
  * transient failures. If a download fails after retries:
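
Side note: the Javadoc's suggested future enhancement is concrete enough to sketch. A minimal version, assuming a hypothetical startCopy(request) helper that wraps this class's existing per-request S3TransferManager submission, and using CopyRequest as a stand-in for the actual request type (neither name is from this commit):

    // Sketch only: gate submissions with a Semaphore so a new download starts as soon
    // as any slot frees up, avoiding the "slowest task in batch" bottleneck.
    void copyFilesContinuously(List<CopyRequest> requests) throws InterruptedException {
        Semaphore slots = new Semaphore(maxConcurrentCopies);
        List<CompletableFuture<CompletedCopy>> futures = new ArrayList<>();
        for (CopyRequest request : requests) {
            slots.acquire(); // blocks until one of the maxConcurrentCopies slots is free
            CompletableFuture<CompletedCopy> future = startCopy(request); // hypothetical helper
            future.whenComplete((result, error) -> slots.release()); // free the slot either way
            futures.add(future);
        }
        // Drain the in-flight tail; error translation would stay in waitForCopies().
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
    }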
@@ -70,10 +74,39 @@ class NativeS3BulkCopyHelper {

     private final S3TransferManager transferManager;
     private final int maxConcurrentCopies;
+    private final int maxConnections;

-    public NativeS3BulkCopyHelper(S3TransferManager transferManager, int maxConcurrentCopies) {
+    /**
+     * Creates a new bulk copy helper.
+     *
+     * @param transferManager the S3 transfer manager for async downloads
+     * @param maxConcurrentCopies the requested maximum number of concurrent copy operations
+     * @param maxConnections the HTTP connection pool size; if {@code maxConcurrentCopies} exceeds
+     *     this value, it is clamped down to prevent connection pool exhaustion
+     */
+    NativeS3BulkCopyHelper(
+            S3TransferManager transferManager, int maxConcurrentCopies, int maxConnections) {
+        checkArgument(maxConcurrentCopies > 0, "maxConcurrentCopies must be positive");
+        checkArgument(maxConnections > 0, "maxConnections must be positive");
         this.transferManager = transferManager;
-        this.maxConcurrentCopies = maxConcurrentCopies;
+        this.maxConnections = maxConnections;
+        if (maxConcurrentCopies > maxConnections) {
+            LOG.warn(
+                    "{} ({}) exceeds {} ({}). "
+                            + "Clamping concurrent copies to {} to prevent connection pool exhaustion.",
+                    NativeS3FileSystemFactory.BULK_COPY_MAX_CONCURRENT.key(),
+                    maxConcurrentCopies,
+                    NativeS3FileSystemFactory.MAX_CONNECTIONS.key(),
+                    maxConnections,
+                    maxConnections);
+            this.maxConcurrentCopies = maxConnections;
+        } else {
+            this.maxConcurrentCopies = maxConcurrentCopies;
+        }
+    }
+
+    int getMaxConcurrentCopies() {
+        return maxConcurrentCopies;
     }

     /**
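
For a concrete sense of the clamp: requesting 100 concurrent copies against a 50-connection pool yields an effective concurrency of 50 plus a WARN naming both config keys. Since the tests in this commit construct the helper with a null transfer manager, the behavior can be checked without any S3 client:

    NativeS3BulkCopyHelper clamped = new NativeS3BulkCopyHelper(null, 100, 50);
    // clamped.getMaxConcurrentCopies() now returns 50, not 100.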
@@ -97,9 +130,17 @@ public void copyFiles(
             return;
         }

-        LOG.info("Starting bulk copy of {} files using S3TransferManager", requests.size());
+        int totalFiles = requests.size();
+        int totalBatches = (totalFiles + maxConcurrentCopies - 1) / maxConcurrentCopies;
+        LOG.info(
+                "Starting bulk copy of {} files using S3TransferManager "
+                        + "(batch size: {}, total batches: {})",
+                totalFiles,
+                maxConcurrentCopies,
+                totalBatches);

         List<CompletableFuture<CompletedCopy>> copyFutures = new ArrayList<>();
+        int batchNumber = 0;

         try {
             for (int i = 0; i < requests.size(); i++) {
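
Note that (totalFiles + maxConcurrentCopies - 1) / maxConcurrentCopies is integer ceiling division: for example, 10 files with a batch size of 4 gives (10 + 3) / 4 = 3 batches (4 + 4 + 2 files).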
@@ -113,12 +154,18 @@ public void copyFiles(
                 }

                 if (copyFutures.size() >= maxConcurrentCopies || i == requests.size() - 1) {
+                    batchNumber++;
+                    LOG.debug(
+                            "Waiting for batch {}/{} ({} files)",
+                            batchNumber,
+                            totalBatches,
+                            copyFutures.size());
                     waitForCopies(copyFutures);
                     copyFutures.clear();
                 }
             }

-            LOG.info("Completed bulk copy of {} files", requests.size());
+            LOG.info("Completed bulk copy of {} files", totalFiles);
         } catch (Exception e) {
             if (!copyFutures.isEmpty()) {
                 LOG.warn(
@@ -181,8 +228,42 @@ private void waitForCopies(List<CompletableFuture<CompletedCopy>> futures) throw
             Thread.currentThread().interrupt();
             throw new IOException("Bulk copy interrupted", e);
         } catch (ExecutionException e) {
-            throw new IOException("Bulk copy failed", e.getCause());
+            Throwable cause = e.getCause();
+            if (isConnectionPoolExhausted(cause)) {
+                throw new IOException(
+                        String.format(
+                                "S3 connection pool exhausted during bulk copy. "
+                                        + "The configured connection pool size (%d) could not serve "
+                                        + "the concurrent download requests (%d). "
+                                        + "Consider reducing '%s' or increasing '%s'.",
+                                maxConnections,
+                                maxConcurrentCopies,
+                                NativeS3FileSystemFactory.BULK_COPY_MAX_CONCURRENT.key(),
+                                NativeS3FileSystemFactory.MAX_CONNECTIONS.key()),
+                        cause);
+            }
+            throw new IOException("Bulk copy failed", cause);
+        }
+    }
+
+    /**
+     * Checks whether a failure was caused by HTTP connection pool exhaustion.
+     *
+     * <p>Walks the causal chain looking for the SDK's characteristic message about connection
+     * acquire timeouts. This detection is deliberately broad (substring match on the message) to
+     * remain resilient to minor SDK wording changes across versions.
+     */
+    @VisibleForTesting
+    static boolean isConnectionPoolExhausted(Throwable throwable) {
+        Throwable current = throwable;
+        while (current != null) {
+            String message = current.getMessage();
+            if (message != null && message.contains("Acquire operation took longer than")) {
+                return true;
+            }
+            current = current.getCause();
         }
+        return false;
     }

     /**

flink-filesystems/flink-s3-fs-native/src/main/java/org/apache/flink/fs/s3native/NativeS3FileSystem.java

Lines changed: 6 additions & 0 deletions
@@ -170,6 +170,12 @@ S3ClientProvider getClientProvider() {
         return clientProvider;
     }

+    @VisibleForTesting
+    @Nullable
+    NativeS3BulkCopyHelper getBulkCopyHelper() {
+        return bulkCopyHelper;
+    }
+
     @Override
     public URI getUri() {
         return uri;

flink-filesystems/flink-s3-fs-native/src/main/java/org/apache/flink/fs/s3native/NativeS3FileSystemFactory.java

Lines changed: 25 additions & 1 deletion
@@ -122,6 +122,15 @@ public class NativeS3FileSystemFactory implements FileSystemFactory {
                     .defaultValue(true)
                     .withDescription("Enable bulk copy operations using S3TransferManager");

+    public static final ConfigOption<Integer> MAX_CONNECTIONS =
+            ConfigOptions.key("s3.connection.max")
+                    .intType()
+                    .defaultValue(50)
+                    .withDescription(
+                            "Maximum number of HTTP connections in the S3 client connection pool. "
+                                    + "Applies to both the sync client (Apache HTTP) and the async client (Netty). "
+                                    + "Must be at least as large as 's3.bulk-copy.max-concurrent'.");
+
     public static final ConfigOption<Integer> BULK_COPY_MAX_CONCURRENT =
             ConfigOptions.key("s3.bulk-copy.max-concurrent")
                     .intType()
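
For operators, the two options are meant to be tuned together. A flink-conf.yaml sketch with illustrative values (only the default of 50 for s3.connection.max is confirmed by this diff):

    s3.connection.max: 64            # HTTP connection pool size (sync and async clients)
    s3.bulk-copy.max-concurrent: 32  # keep <= s3.connection.max; larger values are clamped with a WARN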
@@ -348,13 +357,21 @@ public FileSystem create(URI fsUri) throws IOException {
                     readBufferSize);
         }

+        final int maxConnections = config.get(MAX_CONNECTIONS);
+        Preconditions.checkArgument(
+                maxConnections > 0,
+                "'%s' must be a positive integer, but was %s",
+                MAX_CONNECTIONS.key(),
+                maxConnections);
+
         S3ClientProvider clientProvider =
                 S3ClientProvider.builder()
                         .accessKey(accessKey)
                         .secretKey(secretKey)
                         .region(region)
                         .endpoint(endpoint)
                         .pathStyleAccess(pathStyleAccess)
+                        .maxConnections(maxConnections)
                         .connectionTimeout(config.get(CONNECTION_TIMEOUT))
                         .socketTimeout(config.get(SOCKET_TIMEOUT))
                         .connectionMaxIdleTime(config.get(CONNECTION_MAX_IDLE_TIME))
@@ -371,10 +388,17 @@ public FileSystem create(URI fsUri) throws IOException {

         NativeS3BulkCopyHelper bulkCopyHelper = null;
         if (config.get(BULK_COPY_ENABLED)) {
+            final int bulkCopyMaxConcurrent = config.get(BULK_COPY_MAX_CONCURRENT);
+            Preconditions.checkArgument(
+                    bulkCopyMaxConcurrent > 0,
+                    "'%s' must be a positive integer, but was %s",
+                    BULK_COPY_MAX_CONCURRENT.key(),
+                    bulkCopyMaxConcurrent);
             bulkCopyHelper =
                     new NativeS3BulkCopyHelper(
                             clientProvider.getTransferManager(),
-                            config.get(BULK_COPY_MAX_CONCURRENT));
+                            bulkCopyMaxConcurrent,
+                            maxConnections);
         }

         return new NativeS3FileSystem(
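
Note the layered validation: the factory checks both options eagerly so a misconfiguration fails at filesystem creation, while NativeS3BulkCopyHelper's constructor repeats the checks and clamps, so the helper stays safe even when constructed directly (as the tests in this commit do).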

flink-filesystems/flink-s3-fs-native/src/main/java/org/apache/flink/fs/s3native/S3ClientProvider.java

Lines changed: 3 additions & 1 deletion
@@ -389,7 +389,9 @@ public S3ClientProvider build() {
                         NettyNioAsyncHttpClient.builder()
                                 .maxConcurrency(maxConnections)
                                 .connectionTimeout(connectionTimeout)
-                                .readTimeout(socketTimeout))
+                                .readTimeout(socketTimeout)
+                                .connectionAcquisitionTimeout(
+                                        connectionTimeout))
                         .overrideConfiguration(overrideConfig)
                         .endpointOverride(endpointUri)
                         .build())
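
This hunk covers only the async (Netty) client. MAX_CONNECTIONS is documented as also applying to the sync Apache HTTP client, whose builder is not shown in this diff; its counterpart would presumably look like this sketch (the AWS SDK v2 ApacheHttpClient builder does expose these settings):

    // Sketch of the assumed sync-client counterpart, mirroring the Netty settings above.
    ApacheHttpClient.builder()
            .maxConnections(maxConnections)                  // pool size, mirrors maxConcurrency
            .connectionTimeout(connectionTimeout)
            .socketTimeout(socketTimeout)
            .connectionAcquisitionTimeout(connectionTimeout) // fail fast when the pool is exhausted
            .build();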

flink-filesystems/flink-s3-fs-native/src/test/java/org/apache/flink/fs/s3native/NativeS3BulkCopyHelperTest.java

Lines changed: 52 additions & 1 deletion
@@ -20,15 +20,24 @@

 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.MethodSource;
+import software.amazon.awssdk.core.exception.SdkClientException;
+
+import java.util.Collections;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Stream;

 import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatNullPointerException;

 /** Tests for {@link NativeS3BulkCopyHelper}. */
 class NativeS3BulkCopyHelperTest {

-    private static final NativeS3BulkCopyHelper helper = new NativeS3BulkCopyHelper(null, 1);
+    private static final NativeS3BulkCopyHelper helper = new NativeS3BulkCopyHelper(null, 1, 1);
+
+    // --- URI parsing tests ---

     @ParameterizedTest
     @CsvSource({
@@ -105,4 +114,46 @@ void testExtractKeyVeryLongPath() {
         path.append("file.txt");
         assertThat(helper.extractKey("s3://bucket/" + path)).isEqualTo(path.toString());
     }
+
+    private static Stream<Arguments> connectionPoolExhaustedCases() {
+        return Stream.of(
+                Arguments.of(
+                        "direct message match",
+                        SdkClientException.builder()
+                                .message(
+                                        "Unable to execute HTTP request: "
+                                                + "Acquire operation took longer than the configured maximum time.")
+                                .build(),
+                        true),
+                Arguments.of(
+                        "nested causal chain",
+                        SdkClientException.builder()
+                                .message("Unable to execute HTTP request")
+                                .cause(
+                                        new RuntimeException(
+                                                "channel acquisition failed",
+                                                new TimeoutException(
+                                                        "Acquire operation took longer than 10000 milliseconds.")))
+                                .build(),
+                        true),
+                Arguments.of(
+                        "unrelated error",
+                        SdkClientException.builder().message("Access Denied").build(),
+                        false),
+                Arguments.of("null message", new RuntimeException((String) null), false),
+                Arguments.of("null throwable", null, false));
+    }
+
+    @ParameterizedTest(name = "{0}")
+    @MethodSource("connectionPoolExhaustedCases")
+    void testConnectionPoolExhaustedDetection(
+            String description, Throwable throwable, boolean expected) {
+        assertThat(NativeS3BulkCopyHelper.isConnectionPoolExhausted(throwable)).isEqualTo(expected);
+    }
+
+    @Test
+    void testEmptyRequestListIsNoOp() throws Exception {
+        NativeS3BulkCopyHelper noOpHelper = new NativeS3BulkCopyHelper(null, 16, 50);
+        noOpHelper.copyFiles(Collections.emptyList(), null);
+    }
 }
