Skip to content

Commit ca8ad9c

Browse files
Auto Exit MM with timeout
1 parent e7ada09 commit ca8ad9c

File tree

5 files changed

+127
-8
lines changed

5 files changed

+127
-8
lines changed

helix-core/src/main/java/org/apache/helix/HelixAdmin.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,19 @@ void autoEnableMaintenanceMode(String clusterName, boolean enabled, String reaso
422422
void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, String reason,
423423
Map<String, String> customFields);
424424

425+
/**
426+
* Manually enable maintenance mode with timeout. To be called by the REST client.
427+
* The cluster will automatically exit maintenance mode after the specified timeout.
428+
* @param clusterName the cluster name
429+
* @param enabled if true, enter maintenance mode; if false, exit maintenance mode
430+
* @param reason reason to enter maintenance mode
431+
* @param timeout time in milliseconds after which maintenance mode should be exited automatically.
432+
* Only applicable when enabled is true. Any non-positive value (e.g. -1) means no automatic exit.
433+
* @param customFields user-specified KV mappings to be stored in the ZNode
434+
*/
435+
void manuallyEnableMaintenanceModeWithTimeout(String clusterName, boolean enabled, String reason,
436+
long timeout, Map<String, String> customFields);
437+
425438
/**
426439
* Check specific cluster is in maintenance mode or not
427440
* @param clusterName the cluster name

helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,33 @@ public void execute(final ClusterEvent event) throws Exception {
6363
// Check for the maintenance signal
6464
// If the signal is null (which shouldn't happen), skip this stage; user-triggered signals are handled below
6565
MaintenanceSignal maintenanceSignal = cache.getMaintenanceSignal();
66-
if (maintenanceSignal == null || maintenanceSignal
67-
.getTriggeringEntity() != MaintenanceSignal.TriggeringEntity.CONTROLLER) {
66+
if (maintenanceSignal == null) {
6867
return;
6968
}
70-
7169
HelixManager manager = event.getAttribute(AttributeName.helixmanager.name());
7270
if (manager == null || !manager.isConnected()) {
7371
LogUtil.logInfo(LOG, _eventId,
7472
"MaintenanceRecoveryStage failed due to HelixManager being null or not connected!");
7573
return;
7674
}
75+
// Check if this is a user-triggered maintenance mode with an end time
76+
if (maintenanceSignal.getTriggeringEntity() == MaintenanceSignal.TriggeringEntity.USER) {
77+
long endTime = maintenanceSignal.getEndTime();
78+
// If endTime is set and the current time has passed the end time, exit maintenance mode
79+
if (endTime > 0 && System.currentTimeMillis() >= endTime) {
80+
String reason = String.format(
81+
"Timeout-based exit from maintenance mode for cluster %s; End time %d has passed.",
82+
event.getClusterName(), endTime);
83+
84+
manager.getClusterManagmentTool().manuallyEnableMaintenanceMode(manager.getClusterName(), false,
85+
reason, null);
86+
cache.setMaintenanceSignalChanged(); // Set the flag so we do not double enable/disable
87+
LogUtil.logInfo(LOG, _eventId, reason);
88+
return;
89+
}
90+
// Not yet time to exit, or no end time set
91+
return;
92+
}
7793

7894
// At this point, the cluster entered maintenance mode automatically. Retrieve the
7995
// auto-triggering reason

helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1179,7 +1179,15 @@ public void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, S
11791179
Map<String, String> customFields) {
11801180
processMaintenanceMode(clusterName, enabled, reason,
11811181
MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE, customFields,
1182-
MaintenanceSignal.TriggeringEntity.USER);
1182+
MaintenanceSignal.TriggeringEntity.USER, -1);
1183+
}
1184+
1185+
@Override
1186+
public void manuallyEnableMaintenanceModeWithTimeout(String clusterName, boolean enabled,
1187+
String reason, long timeout, Map<String, String> customFields) {
1188+
processMaintenanceMode(clusterName, enabled, reason,
1189+
MaintenanceSignal.AutoTriggerReason.NOT_APPLICABLE, customFields,
1190+
MaintenanceSignal.TriggeringEntity.USER, timeout);
11831191
}
11841192

11851193
/**
@@ -1195,6 +1203,26 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
11951203
final String reason, final MaintenanceSignal.AutoTriggerReason internalReason,
11961204
final Map<String, String> customFields,
11971205
final MaintenanceSignal.TriggeringEntity triggeringEntity) {
1206+
processMaintenanceMode(clusterName, enabled, reason, internalReason, customFields,
1207+
triggeringEntity, -1);
1208+
}
1209+
1210+
/**
1211+
* Helper method for enabling/disabling maintenance mode.
1212+
* @param clusterName
1213+
* @param enabled
1214+
* @param reason
1215+
* @param internalReason
1216+
* @param customFields
1217+
* @param triggeringEntity
1218+
* @param timeout time in milliseconds after which maintenance mode should be exited automatically.
1219+
* Only applicable when enabled is true. Any non-positive value (e.g. -1) means no automatic exit.
1220+
*/
1221+
private void processMaintenanceMode(String clusterName, final boolean enabled,
1222+
final String reason, final MaintenanceSignal.AutoTriggerReason internalReason,
1223+
final Map<String, String> customFields,
1224+
final MaintenanceSignal.TriggeringEntity triggeringEntity,
1225+
final long timeout) {
11981226
HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor);
11991227
PropertyKey.Builder keyBuilder = accessor.keyBuilder();
12001228
logger.info("Cluster {} {} {} maintenance mode for reason {}.", clusterName,
@@ -1212,6 +1240,15 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
12121240
}
12131241
maintenanceSignal.setTimestamp(currentTime);
12141242
maintenanceSignal.setTriggeringEntity(triggeringEntity);
1243+
1244+
// Set end time if timeout is provided
1245+
if (timeout > 0) {
1246+
long endTime = currentTime + timeout;
1247+
maintenanceSignal.setEndTime(endTime);
1248+
} else {
1249+
maintenanceSignal.setEndTime(-1); // No automatic exit
1250+
}
1251+
12151252
switch (triggeringEntity) {
12161253
case CONTROLLER:
12171254
// autoEnable
@@ -1234,6 +1271,23 @@ private void processMaintenanceMode(String clusterName, final boolean enabled,
12341271
if (!accessor.createMaintenance(maintenanceSignal)) {
12351272
throw new HelixException("Failed to create maintenance signal!");
12361273
}
1274+
1275+
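// If timeout is provided, create /<clusterName>/CONTROLLER/MAINTENANCE_TTL as a PERSISTENT_WITH_TTL znode.
// NOTE(review): the TTL passed below is (int) (timeout / 1000), so timeouts under 1000 ms become 0 -- confirm the unit the accessor expects.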
1276+
if (timeout > 0) {
1277+
try {
1278+
String maintenanceTTLPath = "/" + clusterName + "/CONTROLLER/MAINTENANCE_TTL";
1279+
ZNRecord record = new ZNRecord("MAINTENANCE_TTL");
1280+
1281+
boolean success = ((ZkBaseDataAccessor<ZNRecord>) accessor.getBaseDataAccessor())
1282+
.create(maintenanceTTLPath, record, AccessOption.PERSISTENT_WITH_TTL, (int)(timeout / 1000));
1283+
1284+
if (!success) {
1285+
logger.warn("Failed to create TTL znode for maintenance mode. Auto exit may not work.");
1286+
}
1287+
} catch (Exception e) {
1288+
logger.warn("Failed to create TTL znode for maintenance mode. Auto exit may not work. ", e);
1289+
}
1290+
}
12371291
}
12381292

12391293
// Record a MaintenanceSignal history

helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ public class MaintenanceSignal extends PauseSignal {
3232
public enum MaintenanceSignalProperty {
3333
TRIGGERED_BY,
3434
TIMESTAMP,
35-
AUTO_TRIGGER_REASON
35+
AUTO_TRIGGER_REASON,
36+
END_TIME
3637
}
3738

3839
/**
@@ -112,4 +113,21 @@ public void setTimestamp(long timestamp) {
112113
public long getTimestamp() {
113114
return _record.getLongField(MaintenanceSignalProperty.TIMESTAMP.name(), -1);
114115
}
116+
117+
/**
118+
* Sets the end time for maintenance mode.
119+
* @param endTime the absolute time (epoch milliseconds) at which maintenance mode should end. A value of -1 means
120+
* no automatic exit.
121+
*/
122+
public void setEndTime(long endTime) {
123+
_record.setLongField(MaintenanceSignalProperty.END_TIME.name(), endTime);
124+
}
125+
126+
/**
127+
* Returns the end time for maintenance mode.
128+
* @return the absolute time (epoch milliseconds) at which maintenance mode should end. Returns -1 if no end time is set.
129+
*/
130+
public long getEndTime() {
131+
return _record.getLongField(MaintenanceSignalProperty.END_TIME.name(), -1);
132+
}
115133
}

helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ClusterAccessor.java

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -332,9 +332,27 @@ public Response updateCluster(@PathParam("clusterId") String clusterId,
332332
} catch (Exception e) {
333333
// NOP
334334
}
335-
helixAdmin
336-
.manuallyEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode,
337-
content, customFieldsMap);
335+
336+
// Check if a timeout (in milliseconds) is specified under the "timeout" key of the custom fields
337+
long timeout = -1;
338+
if (customFieldsMap != null) {
339+
try {
340+
String timeoutStr = customFieldsMap.get("timeout");
341+
if (timeoutStr != null && !timeoutStr.isEmpty()) {
342+
timeout = Long.parseLong(timeoutStr);
343+
}
344+
} catch (NumberFormatException nfe) {
345+
LOG.warn("Invalid timeout value specified", nfe);
346+
}
347+
}
348+
349+
if (timeout > 0 && command == Command.enableMaintenanceMode) {
350+
helixAdmin.manuallyEnableMaintenanceModeWithTimeout(clusterId, true, content, timeout,
351+
customFieldsMap);
352+
} else {
353+
helixAdmin.manuallyEnableMaintenanceMode(clusterId, command == Command.enableMaintenanceMode,
354+
content, customFieldsMap);
355+
}
338356
break;
339357
case enableWagedRebalanceForAllResources:
340358
// Enable WAGED rebalance for all resources in the cluster

0 commit comments

Comments
 (0)