-
Notifications
You must be signed in to change notification settings - Fork 577
fix(server): fix the scheduler and the scheduler selection logic #2937
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
b4321eb
d19096d
a32901a
42ee2ba
c976caa
570d670
5915428
e7da7e5
b0f381c
804055d
2e8578e
2c44cee
925c384
3974048
a349f62
3e7bc6f
e6cc98b
e6f6487
b325dba
68b906a
1113520
5ffd20b
a110112
a31e937
d89b9bd
5807fb7
f8fc58a
6dd52e4
af85bef
b332674
cac78f4
28e0390
5e30cac
b70788f
7ba40bd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -33,6 +33,7 @@ | |||||||||||||||||
| import java.util.Map; | ||||||||||||||||||
| import java.util.Objects; | ||||||||||||||||||
| import java.util.Set; | ||||||||||||||||||
| import java.util.UUID; | ||||||||||||||||||
| import java.util.concurrent.ConcurrentHashMap; | ||||||||||||||||||
| import java.util.concurrent.Future; | ||||||||||||||||||
| import java.util.concurrent.TimeUnit; | ||||||||||||||||||
|
|
@@ -68,6 +69,7 @@ | |||||||||||||||||
| import org.apache.hugegraph.config.TypedOption; | ||||||||||||||||||
| import org.apache.hugegraph.event.EventHub; | ||||||||||||||||||
| import org.apache.hugegraph.exception.ExistedException; | ||||||||||||||||||
| import org.apache.hugegraph.exception.NotFoundException; | ||||||||||||||||||
| import org.apache.hugegraph.exception.NotSupportException; | ||||||||||||||||||
| import org.apache.hugegraph.io.HugeGraphSONModule; | ||||||||||||||||||
| import org.apache.hugegraph.k8s.K8sDriver; | ||||||||||||||||||
|
|
@@ -195,7 +197,17 @@ public final class GraphManager { | |||||||||||||||||
| public GraphManager(HugeConfig conf, EventHub hub) { | ||||||||||||||||||
| LOG.info("Init graph manager"); | ||||||||||||||||||
| E.checkArgumentNotNull(conf, "The config can't be null"); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Auto-generate server.id if not configured. | ||||||||||||||||||
| // Random generation is to prevent duplicate id error reports.This id is currently | ||||||||||||||||||
| // meaningless and needs to be completely removed serverInfoManager in | ||||||||||||||||||
| // the future | ||||||||||||||||||
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||
| String server = conf.get(ServerOptions.SERVER_ID); | ||||||||||||||||||
| if (StringUtils.isEmpty(server)) { | ||||||||||||||||||
| server = "server-" + UUID.randomUUID().toString().substring(0, 8); | ||||||||||||||||||
| LOG.info("Auto-generated server.id: {}", server); | ||||||||||||||||||
| conf.setProperty(ServerOptions.SERVER_ID.name(), server); | ||||||||||||||||||
| } | ||||||||||||||||||
| String role = conf.get(ServerOptions.SERVER_ROLE); | ||||||||||||||||||
|
|
||||||||||||||||||
| this.config = conf; | ||||||||||||||||||
|
|
@@ -206,10 +218,6 @@ public GraphManager(HugeConfig conf, EventHub hub) { | |||||||||||||||||
| conf.get(ServerOptions.SERVER_DEPLOY_IN_K8S); | ||||||||||||||||||
| this.startIgnoreSingleGraphError = conf.get( | ||||||||||||||||||
| ServerOptions.SERVER_START_IGNORE_SINGLE_GRAPH_ERROR); | ||||||||||||||||||
| E.checkArgument(server != null && !server.isEmpty(), | ||||||||||||||||||
| "The server name can't be null or empty"); | ||||||||||||||||||
| E.checkArgument(role != null && !role.isEmpty(), | ||||||||||||||||||
| "The server role can't be null or empty"); | ||||||||||||||||||
| this.graphsDir = conf.get(ServerOptions.GRAPHS); | ||||||||||||||||||
| this.cluster = conf.get(ServerOptions.CLUSTER); | ||||||||||||||||||
| this.graphSpaces = new ConcurrentHashMap<>(); | ||||||||||||||||||
|
|
@@ -276,7 +284,7 @@ private static String serviceId(String graphSpace, Service.ServiceType type, | |||||||||||||||||
| .replace("_", "-").toLowerCase(); | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| private boolean usePD() { | ||||||||||||||||||
| public boolean usePD() { | ||||||||||||||||||
| return this.PDExist; | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
|
|
@@ -1557,6 +1565,14 @@ private void loadGraph(String name, String graphConfPath) { | |||||||||||||||||
| String raftGroupPeers = this.conf.get(ServerOptions.RAFT_GROUP_PEERS); | ||||||||||||||||||
| config.addProperty(ServerOptions.RAFT_GROUP_PEERS.name(), | ||||||||||||||||||
| raftGroupPeers); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Transfer `pd.peers` from server config to graph config | ||||||||||||||||||
| // Only inject if not already configured in graph config | ||||||||||||||||||
| if (!config.containsKey("pd.peers")) { | ||||||||||||||||||
|
Comment on lines
+1569
to
+1571
|
||||||||||||||||||
| // Transfer `pd.peers` from server config to graph config | |
| // Only inject if not already configured in graph config | |
| if (!config.containsKey("pd.peers")) { | |
| // Transfer `pd.peers` from server config to graph config when PD is used | |
| // Only inject if not already configured in graph config and backend uses PD | |
| String backend = config.get(CoreOptions.BACKEND); | |
| boolean backendUsesPd = "hstore".equalsIgnoreCase(backend); | |
| if (backendUsesPd && !config.containsKey("pd.peers")) { |
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -176,7 +176,6 @@ public class StandardHugeGraph implements HugeGraph { | |||||||||||||||||||||||
| private final BackendStoreProvider storeProvider; | ||||||||||||||||||||||||
| private final TinkerPopTransaction tx; | ||||||||||||||||||||||||
| private final RamTable ramtable; | ||||||||||||||||||||||||
| private final String schedulerType; | ||||||||||||||||||||||||
| private volatile boolean started; | ||||||||||||||||||||||||
| private volatile boolean closed; | ||||||||||||||||||||||||
| private volatile GraphMode mode; | ||||||||||||||||||||||||
|
|
@@ -229,7 +228,6 @@ public StandardHugeGraph(HugeConfig config) { | |||||||||||||||||||||||
| this.closed = false; | ||||||||||||||||||||||||
| this.mode = GraphMode.NONE; | ||||||||||||||||||||||||
| this.readMode = GraphReadMode.OLTP_ONLY; | ||||||||||||||||||||||||
| this.schedulerType = config.get(CoreOptions.SCHEDULER_TYPE); | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| LockUtil.init(this.spaceGraphName()); | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
@@ -315,6 +313,7 @@ public String backend() { | |||||||||||||||||||||||
| return this.storeProvider.type(); | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| @Override | ||||||||||||||||||||||||
| public BackendStoreInfo backendStoreInfo() { | ||||||||||||||||||||||||
| // Just for trigger Tx.getOrNewTransaction, then load 3 stores | ||||||||||||||||||||||||
| // TODO: pass storeProvider.metaStore() | ||||||||||||||||||||||||
|
|
@@ -465,6 +464,7 @@ public void updateTime(Date updateTime) { | |||||||||||||||||||||||
| this.updateTime = updateTime; | ||||||||||||||||||||||||
| } | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| @Override | ||||||||||||||||||||||||
| public void waitStarted() { | ||||||||||||||||||||||||
| // Just for trigger Tx.getOrNewTransaction, then load 3 stores | ||||||||||||||||||||||||
| this.schemaTransaction(); | ||||||||||||||||||||||||
|
|
@@ -1629,7 +1629,9 @@ public <T> void submitEphemeralJob(EphemeralJob<T> job) { | |||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| @Override | ||||||||||||||||||||||||
| public String schedulerType() { | ||||||||||||||||||||||||
| return StandardHugeGraph.this.schedulerType; | ||||||||||||||||||||||||
| // Use distributed scheduler for hstore backend, otherwise use local | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
| // Use distributed scheduler for hstore backend, otherwise use local | |
| /* | |
| * HStore is a distributed backend: data and tasks may be handled by | |
| * multiple graph servers that must coordinate scheduling and state. | |
| * For this reason we require a distributed task scheduler when the | |
| * backend is hstore so that jobs can be balanced and recovered | |
| * across nodes. For other backends, the graph is served by a single | |
| * server instance and tasks are executed locally, so a local | |
| * in-process scheduler is sufficient and avoids the overhead of | |
| * distributed coordination. | |
| */ |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -19,7 +19,9 @@ | |||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| import java.util.Iterator; | ||||||||||||||||||||||||||||||
| import java.util.concurrent.Callable; | ||||||||||||||||||||||||||||||
| import java.util.concurrent.CancellationException; | ||||||||||||||||||||||||||||||
| import java.util.concurrent.ConcurrentHashMap; | ||||||||||||||||||||||||||||||
| import java.util.concurrent.ExecutionException; | ||||||||||||||||||||||||||||||
| import java.util.concurrent.ExecutorService; | ||||||||||||||||||||||||||||||
| import java.util.concurrent.Executors; | ||||||||||||||||||||||||||||||
| import java.util.concurrent.Future; | ||||||||||||||||||||||||||||||
|
|
@@ -48,6 +50,7 @@ | |||||||||||||||||||||||||||||
| import org.slf4j.Logger; | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| public class DistributedTaskScheduler extends TaskAndResultScheduler { | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| private static final Logger LOG = Log.logger(DistributedTaskScheduler.class); | ||||||||||||||||||||||||||||||
| private final long schedulePeriod; | ||||||||||||||||||||||||||||||
| private final ExecutorService taskDbExecutor; | ||||||||||||||||||||||||||||||
|
|
@@ -118,6 +121,11 @@ private static boolean sleep(long ms) { | |||||||||||||||||||||||||||||
| public void cronSchedule() { | ||||||||||||||||||||||||||||||
| // Perform periodic scheduling tasks | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // Check closed flag first to exit early | ||||||||||||||||||||||||||||||
| if (this.closed.get()) { | ||||||||||||||||||||||||||||||
| return; | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| if (!this.graph.started() || this.graph.closed()) { | ||||||||||||||||||||||||||||||
| return; | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
@@ -253,6 +261,10 @@ public <V> Future<?> schedule(HugeTask<V> task) { | |||||||||||||||||||||||||||||
| return this.ephemeralTaskExecutor.submit(task); | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // Validate task state before saving to ensure correct exception type | ||||||||||||||||||||||||||||||
| E.checkState(task.type() != null, "Task type can't be null"); | ||||||||||||||||||||||||||||||
| E.checkState(task.name() != null, "Task name can't be null"); | ||||||||||||||||||||||||||||||
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||||||||
| // Process schema task | ||||||||||||||||||||||||||||||
| // Handle gremlin task | ||||||||||||||||||||||||||||||
| // Handle OLAP calculation tasks | ||||||||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 在 if (this.closed.get()) {
return;
}问题: 建议:
Suggested change
将所有状态检查合并到一个 if 语句中,减少竞态窗口。 |
||||||||||||||||||||||||||||||
|
|
@@ -284,14 +296,41 @@ protected <V> void initTaskParams(HugeTask<V> task) { | |||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 在 原逻辑:
新逻辑: if (!force) {
if (!task.completed() && task.status() != TaskStatus.DELETING) {
throw new IllegalArgumentException(
String.format("Can't delete incomplete task '%s' in status %s, " +
"Please try to cancel the task first",
id, task.status()));
}
}
return this.deleteFromDB(id);问题:
建议:重新考虑删除流程,保持与原有逻辑的兼容性,或在 PR 描述中明确说明此行为变更 |
||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| /** | ||||||||||||||||||||||||||||||
| * Note: This method will update the status of the input task. | ||||||||||||||||||||||||||||||
| * | ||||||||||||||||||||||||||||||
| * @param task | ||||||||||||||||||||||||||||||
| * @param <V> | ||||||||||||||||||||||||||||||
| */ | ||||||||||||||||||||||||||||||
| @Override | ||||||||||||||||||||||||||||||
| public <V> void cancel(HugeTask<V> task) { | ||||||||||||||||||||||||||||||
| // Update status to CANCELLING | ||||||||||||||||||||||||||||||
| if (!task.completed()) { | ||||||||||||||||||||||||||||||
| // Task not completed, can only execute status not CANCELLING | ||||||||||||||||||||||||||||||
| this.updateStatus(task.id(), null, TaskStatus.CANCELLING); | ||||||||||||||||||||||||||||||
| E.checkArgumentNotNull(task, "Task can't be null"); | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| if (task.completed() || task.cancelling()) { | ||||||||||||||||||||||||||||||
| return; | ||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| LOG.info("Cancel task '{}' in status {}", task.id(), task.status()); | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| // Check if task is running locally, cancel it directly if so | ||||||||||||||||||||||||||||||
| HugeTask<?> runningTask = this.runningTasks.get(task.id()); | ||||||||||||||||||||||||||||||
| if (runningTask != null) { | ||||||||||||||||||||||||||||||
| boolean cancelled = runningTask.cancel(true); | ||||||||||||||||||||||||||||||
| if (cancelled) { | ||||||||||||||||||||||||||||||
| task.overwriteStatus(TaskStatus.CANCELLED); | ||||||||||||||||||||||||||||||
|
Comment on lines
+318
to
+320
|
||||||||||||||||||||||||||||||
| boolean cancelled = runningTask.cancel(true); | |
| if (cancelled) { | |
| task.overwriteStatus(TaskStatus.CANCELLED); | |
| TaskStatus previousStatus = task.status(); | |
| boolean cancelled = runningTask.cancel(true); | |
| if (cancelled) { | |
| if (this.updateStatus(task.id(), previousStatus, | |
| TaskStatus.CANCELLED)) { | |
| task.overwriteStatus(TaskStatus.CANCELLED); | |
| } else { | |
| LOG.info("Failed to persist cancelled status for task '{}', " + | |
| "status may have changed from {}", | |
| task.id(), previousStatus); | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
currentStatus 做 CAS 更新有明显竞态:调用方读取任务后到执行 cancel() 之间,任务可能从 QUEUED 变成 RUNNING,导致 updateStatus(..., currentStatus, CANCELLING) 失败并直接返回,取消请求被静默丢弃。建议 CAS 失败后读取最新状态并做一次兜底更新(只要未完成就转 CANCELLING)。
| if (!this.updateStatus(task.id(), currentStatus, TaskStatus.CANCELLING)) { | |
| if (!this.updateStatus(task.id(), currentStatus, TaskStatus.CANCELLING)) { | |
| HugeTask<?> latest = this.taskWithoutResult(task.id()); | |
| if (!latest.completed() && !latest.cancelling()) { | |
| this.updateStatus(task.id(), null, TaskStatus.CANCELLING); | |
| task.overwriteStatus(TaskStatus.CANCELLING); | |
| } | |
| } else { | |
| task.overwriteStatus(TaskStatus.CANCELLING); | |
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
在 DistributedTaskScheduler.java:323-330:
if (!this.updateStatus(task.id(), currentStatus, TaskStatus.CANCELLING)) {
LOG.info("Failed to cancel task '{}', status may have changed from {}",
task.id(), currentStatus);
} else {
task.overwriteStatus(TaskStatus.CANCELLING);
}问题:
- 当
updateStatus返回 false 时(数据库更新失败),内存中的task对象状态未同步 - 调用者可能认为 cancel 成功,但数据库中状态未改变
- 缺少错误处理:应该重新读取最新状态或抛出异常
建议:
| LOG.info("Failed to cancel task '{}', status may have changed from {}", | |
| TaskStatus currentStatus = task.status(); | |
| if (!this.updateStatus(task.id(), currentStatus, TaskStatus.CANCELLING)) { | |
| // Status changed concurrently, reload from DB | |
| HugeTask<?> latestTask = this.taskWithoutResult(task.id()); | |
| LOG.info("Failed to cancel task '{}': status changed from {} to {}", | |
| task.id(), currentStatus, latestTask.status()); | |
| task.overwriteStatus(latestTask.status()); | |
| } else { | |
| task.overwriteStatus(TaskStatus.CANCELLING); | |
| } |
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
第 334 行的 TODO 注释过于模糊:
//todo: serverInfoManager section should be removed in the future.
return this.serverManager().close();
//return true;问题:
- 未说明为什么要移除 serverInfoManager
- 未说明移除的时间节点或前提条件
- 注释掉的代码应该删除,而不是保留
建议:
| } | |
| // TODO(issue-XXX): Remove serverInfoManager.close() after migrating to | |
| // pure single-node architecture. Currently kept for backward compatibility. | |
| return this.serverManager().close(); |
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
在 DistributedTaskScheduler.java:355-366:
if (!force && !task.completed()) {
this.updateStatus(id, null, TaskStatus.DELETING);
return null;
// 下面的代码永远无法执行
}问题:
- Line 360 的注释 "Already in DELETING status" 放在了
return null之后,永远无法到达 - 逻辑不清晰:应该区分「未完成的任务」和「已经是 DELETING 状态的任务」
建议:
| // Completed tasks can also be deleted directly | |
| HugeTask<?> task = this.taskWithoutResult(id); | |
| if (!force && !task.completed()) { | |
| // Non-force mode: mark incomplete tasks as DELETING for async cleanup | |
| this.updateStatus(id, null, TaskStatus.DELETING); | |
| return null; | |
| } | |
| // Force mode OR task is completed OR already DELETING: delete directly | |
| return this.deleteFromDB(id); |
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
在 DistributedTaskScheduler.java:399-410:
cronFuture.get(schedulePeriod + 5, TimeUnit.SECONDS);问题:
- 如果 cron 任务超时,后续的
taskDbExecutor清理可能在 cron 任务仍持有事务时执行 - 注释说 "ensure all transactions are closed",但超时时无法保证
schedulePeriod可能很大(如 60 秒),导致关闭等待过长
建议:
| // Wait for cron task with reasonable timeout | |
| long waitTime = Math.min(schedulePeriod + 5, 15); // Cap at 15 seconds | |
| try { | |
| cronFuture.get(waitTime, TimeUnit.SECONDS); | |
| } catch (CancellationException e) { | |
| LOG.debug("Cron task was cancelled"); | |
| } catch (TimeoutException e) { | |
| LOG.warn("Cron task did not complete in {}s, proceeding with shutdown", waitTime); | |
| // Force interrupt the cron task thread if possible | |
| cronFuture.cancel(true); | |
| } catch (ExecutionException | InterruptedException e) { | |
| LOG.warn("Exception while waiting for cron task to complete", e); | |
| } |
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
Tsukilc marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧹 TODO 注释应该追踪到 Issue
在 DistributedTaskScheduler.java:422-424:
//todo: serverInfoManager section should be removed in the future.
return this.serverManager().close();
//return true;建议:
将 TODO 关联到 GitHub Issue,方便追踪:
| //return true; | |
| // TODO(#issue-number): Remove serverInfoManager completely after full single-node migration | |
| return this.serverManager().close(); |
并删除注释掉的代码 //return true;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
task.scheduler_type,但同目录的server1-conf/server3-conf仍保留该配置项。由于本 PR 同时删除了CoreOptions.SCHEDULER_TYPE,这会导致示例配置不一致,并在启动时产生冗余配置告警。建议同步清理另外两个 docker 配置文件,避免误导运维配置。