KAFKA-19478 [3/N]: Use heaps to discover the least loaded process (apache#20172)

lucasbru · web-flow · commit 6247fd9eb36d · 2025-09-03T17:13:01.000+02:00
The original implementation uses a linear search to find the least
loaded process in O(n). We can replace this with look-ups in a heap in
O(log(n)), as described below.

Active tasks: For active tasks, we can do exactly the same assignment as
in the original algorithm by first building a heap (by load) of all
processes. When we assign a task, we pick the head off the heap, assign
the task to it, update the load, and re-insert it into the heap in
O(log(n)).

Standby tasks: For standby tasks, we cannot do this optimization
directly, because of the order in which we assign tasks:

1. We first try to assign task A to a process that previously owned A.
2. If we did not find such a process, we assign A to the least loaded
node.
3. We now try to assign task B to a process that previously owned B
4. If we did not find such a process, we assign B to the least loaded
node
   ...

The problem is that we cannot efficiently keep a heap (by load)
throughout this process, because finding and removing the process that
previously owned A (and B and…) in the heap is O(n). We therefore need
to change the order of evaluation to be able to use a heap:

1. Try to assign all tasks A, B.. to a process that previously owned the
task
2. Build a heap.
3. Assign all remaining tasks to the least-loaded process that does not
yet own the task. Since at most NumStandbyReplicas already own the task,
we can do it by removing up to NumStandbyReplicas from the top of the
heap in O(log(n)), so we get O(log(NumProcesses)*NumStandbyReplicas).

Note that the change in order changes the resulting standby assignments
(although this difference does not show up in the existing unit tests).
I would argue that the new order of assignment will actually yield
better assignments, since the assignment will be more sticky, which has
the potential to reduce the amount of state we have to restore from the
changelog topic after assignments.

In our worst-performing benchmark, this improves the runtime by ~107x.

Reviewers: Bill Bejeck <bbejeck@apache.org>
diff --git a/group-coordinator/src/main/java/org/apache/kafka/coordinator/group/streams/assignor/StickyTaskAssignor.java b/group-coordinator/src/main/java/org/apache/kafka/coordinator/group/streams/assignor/StickyTaskAssignor.java
diff --git a/group-coordinator/src/test/java/org/apache/kafka/coordinator/group/streams/assignor/StickyTaskAssignorTest.java b/group-coordinator/src/test/java/org/apache/kafka/coordinator/group/streams/assignor/StickyTaskAssignorTest.java
@@ -833,6 +833,265 @@ public void shouldAssignTasksToNewClientWithoutFlippingAssignmentBetweenExisting
         assertEquals(2, getAllActiveTaskIds(result, "newMember").size());
     }
 
+    @Test
+    public void shouldHandleLargeNumberOfTasksWithStandbyAssignment() {
+        final int numTasks = 100;
+        final int numClients = 5;
+        final int numStandbyReplicas = 2;
+        
+        Map<String, AssignmentMemberSpec> members = new HashMap<>();
+        for (int i = 0; i < numClients; i++) {
+            members.put("member" + i, createAssignmentMemberSpec("process" + i));
+        }
+
+        GroupAssignment result = assignor.assign(
+            new GroupSpecImpl(members, mkMap(mkEntry(NUM_STANDBY_REPLICAS_CONFIG, String.valueOf(numStandbyReplicas)))),
+            new TopologyDescriberImpl(numTasks, true, List.of("test-subtopology"))
+        );
+
+        // Verify all active tasks are assigned: every task id must show up as active on some member
+        Set<Integer> allActiveTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberActiveTasks = getAllActiveTaskIds(result, memberId);
+            allActiveTasks.addAll(memberActiveTasks);
+        }
+        assertEquals(numTasks, allActiveTasks.size());
+
+        // Collect the distinct standby task ids across all members (the set collapses replicas of the same task id)
+        Set<Integer> allStandbyTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberStandbyTasks = getAllStandbyTaskIds(result, memberId);
+            allStandbyTasks.addAll(memberStandbyTasks);
+        }
+        // With 5 clients and 2 standby replicas, we should have at least some standby tasks
+        assertTrue(allStandbyTasks.size() > 0, "Should have some standby tasks assigned");
+        // Loose upper bound on the distinct ids collected above: numTasks * min(numStandbyReplicas, numClients - 1) = 100 * 2 = 200
+        int maxPossibleStandbyTasks = numTasks * Math.min(numStandbyReplicas, numClients - 1);
+        assertTrue(allStandbyTasks.size() <= maxPossibleStandbyTasks, 
+            "Should not exceed maximum possible standby tasks: " + maxPossibleStandbyTasks);
+
+        // Verify no client has both active and standby for the same task
+        for (String memberId : result.members().keySet()) {
+            Set<Integer> memberActiveTasks = new HashSet<>(getAllActiveTaskIds(result, memberId));
+            Set<Integer> memberStandbyTasks = new HashSet<>(getAllStandbyTaskIds(result, memberId));
+            memberActiveTasks.retainAll(memberStandbyTasks);
+            assertTrue(memberActiveTasks.isEmpty(), "Client " + memberId + " has both active and standby for same task");
+        }
+
+        // Verify active tasks are spread evenly: track the smallest and largest per-client active count
+        int minActiveTasks = Integer.MAX_VALUE;
+        int maxActiveTasks = 0;
+        for (String memberId : result.members().keySet()) {
+            int activeTaskCount = getAllActiveTaskCount(result, memberId);
+            minActiveTasks = Math.min(minActiveTasks, activeTaskCount);
+            maxActiveTasks = Math.max(maxActiveTasks, activeTaskCount);
+        }
+        // With 100 tasks and 5 clients, each should have 20 tasks
+        assertEquals(20, minActiveTasks);
+        assertEquals(20, maxActiveTasks);
+        
+        // Track the smallest and largest per-client standby task count
+        int minStandbyTasks = Integer.MAX_VALUE;
+        int maxStandbyTasks = 0;
+        for (String memberId : result.members().keySet()) {
+            int standbyTaskCount = getAllStandbyTaskIds(result, memberId).size();
+            minStandbyTasks = Math.min(minStandbyTasks, standbyTaskCount);
+            maxStandbyTasks = Math.max(maxStandbyTasks, standbyTaskCount);
+        }
+        // NOTE(review): the min check is trivially true for a size; only the max check requires that some client holds a standby task
+        assertTrue(minStandbyTasks >= 0);
+        assertTrue(maxStandbyTasks > 0);
+    }
+
+    @Test
+    public void shouldHandleOddNumberOfClientsWithStandbyTasks() {
+        // Odd number of clients (7), even number of tasks (14), one standby replica
+        final int numTasks = 14;
+        final int numClients = 7;
+        final int numStandbyReplicas = 1;
+        
+        Map<String, AssignmentMemberSpec> members = new HashMap<>();
+        for (int i = 0; i < numClients; i++) {
+            members.put("member" + i, createAssignmentMemberSpec("process" + i));
+        }
+
+        GroupAssignment result = assignor.assign(
+            new GroupSpecImpl(members, mkMap(mkEntry(NUM_STANDBY_REPLICAS_CONFIG, String.valueOf(numStandbyReplicas)))),
+            new TopologyDescriberImpl(numTasks, true, List.of("test-subtopology"))
+        );
+
+        // Verify all active tasks are assigned: every task id must show up as active on some member
+        Set<Integer> allActiveTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberActiveTasks = getAllActiveTaskIds(result, memberId);
+            allActiveTasks.addAll(memberActiveTasks);
+        }
+        assertEquals(numTasks, allActiveTasks.size());
+
+        // With one standby replica, every task id must also show up as a standby on some member
+        Set<Integer> allStandbyTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberStandbyTasks = getAllStandbyTaskIds(result, memberId);
+            allStandbyTasks.addAll(memberStandbyTasks);
+        }
+        assertEquals(numTasks * numStandbyReplicas, allStandbyTasks.size());
+
+        // With 14 tasks and 7 clients, each client should have 2 active tasks
+        int expectedTasksPerClient = numTasks / numClients; // 14 / 7 = 2
+        int remainder = numTasks % numClients; // 14 % 7 = 0
+        
+        int clientsWithExpectedTasks = 0;
+        int clientsWithOneMoreTask = 0;
+        for (String memberId : result.members().keySet()) {
+            int activeTaskCount = getAllActiveTaskCount(result, memberId);
+            if (activeTaskCount == expectedTasksPerClient) {
+                clientsWithExpectedTasks++;
+            } else if (activeTaskCount == expectedTasksPerClient + 1) {
+                clientsWithOneMoreTask++;
+            }
+        }
+        assertEquals(numClients - remainder, clientsWithExpectedTasks); // 7 clients should have 2 tasks
+        assertEquals(remainder, clientsWithOneMoreTask); // 0 clients should have 3 tasks
+    }
+
+    @Test
+    public void shouldHandleHighStandbyReplicaCount() {
+        // More standby replicas requested (5) than there are clients (3)
+        final int numTasks = 6;
+        final int numClients = 3;
+        final int numStandbyReplicas = 5;
+        
+        Map<String, AssignmentMemberSpec> members = new HashMap<>();
+        for (int i = 0; i < numClients; i++) {
+            members.put("member" + i, createAssignmentMemberSpec("process" + i));
+        }
+
+        GroupAssignment result = assignor.assign(
+            new GroupSpecImpl(members, mkMap(mkEntry(NUM_STANDBY_REPLICAS_CONFIG, String.valueOf(numStandbyReplicas)))),
+            new TopologyDescriberImpl(numTasks, true, List.of("test-subtopology"))
+        );
+
+        // Verify all active tasks are assigned: every task id must show up as active on some member
+        Set<Integer> allActiveTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberActiveTasks = getAllActiveTaskIds(result, memberId);
+            allActiveTasks.addAll(memberActiveTasks);
+        }
+        assertEquals(numTasks, allActiveTasks.size());
+
+        // With only 3 clients and 5 standby replicas, not all standby replicas can be assigned
+        // since each client can only hold standby tasks for tasks it doesn't have active
+        Set<Integer> allStandbyTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberStandbyTasks = getAllStandbyTaskIds(result, memberId);
+            allStandbyTasks.addAll(memberStandbyTasks);
+        }
+        
+        // Loose upper bound on the distinct standby ids collected above: numTasks * min(numStandbyReplicas, numClients - 1) = 6 * 2 = 12
+        int maxPossibleStandbyTasks = numTasks * Math.min(numStandbyReplicas, numClients - 1);
+        assertTrue(allStandbyTasks.size() <= maxPossibleStandbyTasks);
+        assertTrue(allStandbyTasks.size() > 0); // Should assign at least some standby tasks
+    }
+
+    @Test
+    public void shouldHandleLargeNumberOfSubtopologiesWithStandbyTasks() {
+        // Test with many subtopologies (10), each with the same number of tasks (5)
+        final int numSubtopologies = 10;
+        final int numClients = 4;
+        final int numStandbyReplicas = 1;
+        
+        List<String> subtopologies = new ArrayList<>();
+        for (int i = 0; i < numSubtopologies; i++) {
+            subtopologies.add("subtopology-" + i);
+        }
+        
+        Map<String, AssignmentMemberSpec> members = new HashMap<>();
+        for (int i = 0; i < numClients; i++) {
+            members.put("member" + i, createAssignmentMemberSpec("process" + i));
+        }
+
+        GroupAssignment result = assignor.assign(
+            new GroupSpecImpl(members, mkMap(mkEntry(NUM_STANDBY_REPLICAS_CONFIG, String.valueOf(numStandbyReplicas)))),
+            new TopologyDescriberImpl(5, true, subtopologies) // 5 tasks per subtopology
+        );
+
+        // Every subtopology must appear as a key in at least one member's active task map
+        Set<String> subtopologiesWithTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            MemberAssignment member = result.members().get(memberId);
+            subtopologiesWithTasks.addAll(member.activeTasks().keySet());
+        }
+        assertEquals(numSubtopologies, subtopologiesWithTasks.size());
+
+        // Every subtopology must also appear as a key in at least one member's standby task map
+        Set<String> subtopologiesWithStandbyTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            MemberAssignment member = result.members().get(memberId);
+            subtopologiesWithStandbyTasks.addAll(member.standbyTasks().keySet());
+        }
+        assertEquals(numSubtopologies, subtopologiesWithStandbyTasks.size());
+    }
+
+    @Test
+    public void shouldHandleEdgeCaseWithSingleClientAndMultipleStandbyReplicas() {
+        // Edge case: a single client while 3 standby replicas are requested
+        final int numTasks = 10;
+        final int numStandbyReplicas = 3;
+        
+        Map<String, AssignmentMemberSpec> members = mkMap(
+            mkEntry("member1", createAssignmentMemberSpec("process1"))
+        );
+
+        GroupAssignment result = assignor.assign(
+            new GroupSpecImpl(members, mkMap(mkEntry(NUM_STANDBY_REPLICAS_CONFIG, String.valueOf(numStandbyReplicas)))),
+            new TopologyDescriberImpl(numTasks, true, List.of("test-subtopology"))
+        );
+
+        // The only client should get all active tasks
+        assertEquals(numTasks, getAllActiveTaskCount(result, "member1"));
+        
+        // No standby tasks should be assigned since there's only one client
+        // (standby tasks can't be assigned to the same client as active tasks)
+        assertTrue(getAllStandbyTaskIds(result, "member1").isEmpty());
+    }
+
+    @Test
+    public void shouldHandleEdgeCaseWithMoreStandbyReplicasThanAvailableClients() {
+        // Edge case: more standby replicas requested (5) than available clients (2)
+        final int numTasks = 4;
+        final int numClients = 2;
+        final int numStandbyReplicas = 5; // More than available clients
+        
+        Map<String, AssignmentMemberSpec> members = new HashMap<>();
+        for (int i = 0; i < numClients; i++) {
+            members.put("member" + i, createAssignmentMemberSpec("process" + i));
+        }
+
+        GroupAssignment result = assignor.assign(
+            new GroupSpecImpl(members, mkMap(mkEntry(NUM_STANDBY_REPLICAS_CONFIG, String.valueOf(numStandbyReplicas)))),
+            new TopologyDescriberImpl(numTasks, true, List.of("test-subtopology"))
+        );
+
+        // Verify all active tasks are assigned: every task id must show up as active on some member
+        Set<Integer> allActiveTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberActiveTasks = getAllActiveTaskIds(result, memberId);
+            allActiveTasks.addAll(memberActiveTasks);
+        }
+        assertEquals(numTasks, allActiveTasks.size());
+
+        // With only 2 clients, each task can have at most 1 standby (on the client that does not hold it as active)
+        Set<Integer> allStandbyTasks = new HashSet<>();
+        for (String memberId : result.members().keySet()) {
+            List<Integer> memberStandbyTasks = getAllStandbyTaskIds(result, memberId);
+            allStandbyTasks.addAll(memberStandbyTasks);
+        }
+        
+        // Every task id must show up as a standby on some member: numTasks * 1 = 4 distinct ids
+        assertEquals(numTasks, allStandbyTasks.size());
+    }
+
+
     private int getAllActiveTaskCount(GroupAssignment result, String... memberIds) {
         int size = 0;
         for (String memberId : memberIds) {
diff --git a/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/RestoreIntegrationTest.java b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/RestoreIntegrationTest.java
@@ -573,7 +573,7 @@ public void shouldRecycleStateFromStandbyTaskPromotedToActiveTaskAndNotRestore(f
         createStateForRestoration(inputStream, 0);
 
         if (useNewProtocol) {
-            CLUSTER.setStandbyReplicas(appId, 1);
+            CLUSTER.setGroupStandbyReplicas(appId, 1);
         }
 
         final Properties props1 = props(stateUpdaterEnabled);
diff --git a/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SmokeTestDriverIntegrationTest.java b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SmokeTestDriverIntegrationTest.java
@@ -140,15 +140,19 @@ public void shouldWorkWithRebalance(
 
 
         final Properties props = new Properties();
+        final String appId = safeUniqueTestName(testInfo);
         props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
-        props.put(StreamsConfig.APPLICATION_ID_CONFIG, safeUniqueTestName(testInfo));
+        props.put(StreamsConfig.APPLICATION_ID_CONFIG, appId);
         props.put(InternalConfig.STATE_UPDATER_ENABLED, stateUpdaterEnabled);
         props.put(InternalConfig.PROCESSING_THREADS_ENABLED, processingThreadsEnabled);
-        // decrease the session timeout so that we can trigger the rebalance soon after old client left closed
-        props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 10000);
-        props.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 500);
         if (streamsProtocolEnabled) {
             props.put(StreamsConfig.GROUP_PROTOCOL_CONFIG, GroupProtocol.STREAMS.name().toLowerCase(Locale.getDefault()));
+            // decrease the session timeout so that we can trigger the rebalance soon after old client left closed
+            CLUSTER.setGroupSessionTimeout(appId, 10000);
+            CLUSTER.setGroupHeartbeatTimeout(appId, 1000);
+        } else {
+            // decrease the session timeout so that we can trigger the rebalance soon after old client left closed
+            props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 10000);
         }
 
         // cycle out Streams instances as long as the test is running.
diff --git a/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/StandbyTaskCreationIntegrationTest.java b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/StandbyTaskCreationIntegrationTest.java
@@ -99,7 +99,7 @@ private Properties streamsConfiguration(final boolean streamsProtocolEnabled) {
         streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.IntegerSerde.class);
         if (streamsProtocolEnabled) {
             streamsConfiguration.put(StreamsConfig.GROUP_PROTOCOL_CONFIG, GroupProtocol.STREAMS.name().toLowerCase(Locale.getDefault()));
-            CLUSTER.setStandbyReplicas("app-" + safeTestName, 1);
+            CLUSTER.setGroupStandbyReplicas("app-" + safeTestName, 1);
         } else {
             streamsConfiguration.put(StreamsConfig.NUM_STANDBY_REPLICAS_CONFIG, 1);
         }
diff --git a/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/utils/EmbeddedKafkaCluster.java b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/utils/EmbeddedKafkaCluster.java
@@ -401,6 +401,8 @@ public KafkaConsumer<byte[], byte[]> createConsumerAndSubscribeTo(final Map<Stri
 
     private void addDefaultBrokerPropsIfAbsent(final Properties brokerConfig) {
         brokerConfig.putIfAbsent(CleanerConfig.LOG_CLEANER_DEDUPE_BUFFER_SIZE_PROP, 2 * 1024 * 1024L);
+        brokerConfig.putIfAbsent(GroupCoordinatorConfig.STREAMS_GROUP_MIN_SESSION_TIMEOUT_MS_CONFIG, "100");
+        brokerConfig.putIfAbsent(GroupCoordinatorConfig.STREAMS_GROUP_MIN_HEARTBEAT_INTERVAL_MS_CONFIG, "100");
         brokerConfig.putIfAbsent(GroupCoordinatorConfig.GROUP_MIN_SESSION_TIMEOUT_MS_CONFIG, "0");
         brokerConfig.putIfAbsent(GroupCoordinatorConfig.GROUP_INITIAL_REBALANCE_DELAY_MS_CONFIG, "0");
         brokerConfig.putIfAbsent(GroupCoordinatorConfig.OFFSETS_TOPIC_PARTITIONS_CONFIG, "5");
@@ -439,7 +441,33 @@ public Properties getLogConfig(final String topic) {
         }
     }
 
-    public void setStandbyReplicas(final String groupId, final int numStandbyReplicas) {
+    public void setGroupSessionTimeout(final String groupId, final int sessionTimeoutMs) { // sets GroupConfig.STREAMS_SESSION_TIMEOUT_MS_CONFIG on the given group
+        try (final Admin adminClient = createAdminClient()) { // the admin client is closed when the try block exits
+            adminClient.incrementalAlterConfigs(
+                Map.of(
+                    new ConfigResource(ConfigResource.Type.GROUP, groupId),
+                    List.of(new AlterConfigOp(new ConfigEntry(GroupConfig.STREAMS_SESSION_TIMEOUT_MS_CONFIG, String.valueOf(sessionTimeoutMs)), AlterConfigOp.OpType.SET))
+                )
+            ).all().get(); // wait for the alter-configs request to complete before returning
+        } catch (final InterruptedException | ExecutionException e) {
+            throw new RuntimeException(e); // NOTE(review): the interrupt status is not restored when InterruptedException is caught
+        }
+    }
+
+    public void setGroupHeartbeatTimeout(final String groupId, final int heartbeatTimeoutMs) { // NOTE: despite the "Timeout" name, this sets GroupConfig.STREAMS_HEARTBEAT_INTERVAL_MS_CONFIG on the given group
+        try (final Admin adminClient = createAdminClient()) { // the admin client is closed when the try block exits
+            adminClient.incrementalAlterConfigs(
+                Map.of(
+                    new ConfigResource(ConfigResource.Type.GROUP, groupId),
+                    List.of(new AlterConfigOp(new ConfigEntry(GroupConfig.STREAMS_HEARTBEAT_INTERVAL_MS_CONFIG, String.valueOf(heartbeatTimeoutMs)), AlterConfigOp.OpType.SET))
+                )
+            ).all().get(); // wait for the alter-configs request to complete before returning
+        } catch (final InterruptedException | ExecutionException e) {
+            throw new RuntimeException(e); // NOTE(review): the interrupt status is not restored when InterruptedException is caught
+        }
+    }
+
+    public void setGroupStandbyReplicas(final String groupId, final int numStandbyReplicas) {
         try (final Admin adminClient = createAdminClient()) {
             adminClient.incrementalAlterConfigs(
                 Map.of(

Original file line number	Diff line number	Diff line change
`@@ -573,7 +573,7 @@ public void shouldRecycleStateFromStandbyTaskPromotedToActiveTaskAndNotRestore(f`
`573`	`573`	`createStateForRestoration(inputStream, 0);`
`574`	`574`
`575`	`575`	`if (useNewProtocol) {`
`576`		`- CLUSTER.setStandbyReplicas(appId, 1);`
	`576`	`+ CLUSTER.setGroupStandbyReplicas(appId, 1);`
`577`	`577`	`}`
`578`	`578`
`579`	`579`	`final Properties props1 = props(stateUpdaterEnabled);`
Original file line number	Diff line number	Diff line change
`@@ -99,7 +99,7 @@ private Properties streamsConfiguration(final boolean streamsProtocolEnabled) {`
`99`	`99`	`streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.IntegerSerde.class);`
`100`	`100`	`if (streamsProtocolEnabled) {`
`101`	`101`	`streamsConfiguration.put(StreamsConfig.GROUP_PROTOCOL_CONFIG, GroupProtocol.STREAMS.name().toLowerCase(Locale.getDefault()));`
`102`		`- CLUSTER.setStandbyReplicas("app-" + safeTestName, 1);`
	`102`	`+ CLUSTER.setGroupStandbyReplicas("app-" + safeTestName, 1);`
`103`	`103`	`} else {`
`104`	`104`	`streamsConfiguration.put(StreamsConfig.NUM_STANDBY_REPLICAS_CONFIG, 1);`
`105`	`105`	`}`