Skip to content

Commit f5a61da

Browse files
committed
Fix gpconfig for resource group v2 on segment hosts
The gpcheckresgroupv2impl script failed on segment hosts when running `gpconfig -c gp_resource_manager -v "group-v2"`.

Root cause: the validation script tried to connect to localhost:5432 on each host to retrieve gp_resource_group_cgroup_parent. However, segment hosts do not run the master database, so the connection failed with "Connection refused" errors.

Fix:
- Retrieve gp_resource_group_cgroup_parent from the master database in gpresgroup.py before dispatching the validation commands.
- Pass the cgroup_parent value to gpcheckresgroupv2impl via a command-line argument (--cgroup-parent).
- Remove the database connection logic from gpcheckresgroupv2impl.
- Move gp_resource_group_cgroup_parent from unsync_guc_name.h to sync_guc_name.h to ensure a consistent value across segments.
1 parent 68e5248 commit f5a61da

File tree

4 files changed

+42
-36
lines changed

4 files changed

+42
-36
lines changed

gpMgmt/bin/gpcheckresgroupv2impl

Lines changed: 12 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,9 @@
33

44
import os
55
import sys
6+
import argparse
67
from functools import reduce
78

8-
# Add the gppylib path to sys.path to import database connection modules
9-
try:
10-
from gppylib.db import dbconn
11-
from pg import DatabaseError
12-
except ImportError as err:
13-
sys.exit('Cannot import modules. Please check that you have sourced '
14-
'cloudberry-env.sh. Detail: ' + str(err))
15-
169

1710
class ValidationException(Exception):
1811
def __init__(self, message):
@@ -35,10 +28,10 @@ class CgroupValidation(object):
3528

3629

3730
class CgroupValidationVersionTwo(CgroupValidation):
38-
def __init__(self):
31+
def __init__(self, cgroup_parent=None):
3932
self.mount_point = self.detect_cgroup_mount_point()
4033
self.tab = {"r": os.R_OK, "w": os.W_OK, "x": os.X_OK, "f": os.F_OK}
41-
self.cgroup_parent = self.get_cgroup_parent()
34+
self.cgroup_parent = cgroup_parent if cgroup_parent else "gpdb.service"
4235

4336
def validate_all(self):
4437
"""
@@ -71,29 +64,6 @@ class CgroupValidationVersionTwo(CgroupValidation):
7164

7265
self.validate_permission(self.cgroup_parent + "/io.max", "rw")
7366

74-
def get_cgroup_parent(self):
75-
"""
76-
Get the cgroup parent directory from the database GUC parameter
77-
gp_resource_group_cgroup_parent. If unable to connect to database
78-
or retrieve the parameter, report error using die function.
79-
"""
80-
try:
81-
dburl = dbconn.DbURL()
82-
83-
with dbconn.connect(dburl, utility=True) as conn:
84-
# Query the GUC parameter value
85-
sql = "SHOW gp_resource_group_cgroup_parent"
86-
cursor = dbconn.query(conn, sql)
87-
result = cursor.fetchone()
88-
89-
if result and result[0]:
90-
return result[0]
91-
else:
92-
self.die("failed to retrieve gp_resource_group_cgroup_parent parameter from database")
93-
94-
except Exception as e:
95-
self.die("failed to retrieve gp_resource_group_cgroup_parent parameter: {}".format(str(e)))
96-
9767
def die(self, msg):
9868
raise ValidationException("cgroup is not properly configured: {}".format(msg))
9969

@@ -118,7 +88,15 @@ class CgroupValidationVersionTwo(CgroupValidation):
11888

11989

12090
if __name__ == '__main__':
91+
parser = argparse.ArgumentParser(description='Validate cgroup v2 configuration for resource groups')
92+
parser.add_argument('--cgroup-parent',
93+
dest='cgroup_parent',
94+
default=None,
95+
help='The cgroup parent directory name (gp_resource_group_cgroup_parent value)')
96+
97+
args = parser.parse_args()
98+
12199
try:
122-
CgroupValidationVersionTwo().validate_all()
100+
CgroupValidationVersionTwo(cgroup_parent=args.cgroup_parent).validate_all()
123101
except ValidationException as e:
124102
exit(e.message)

gpMgmt/bin/gppylib/gpresgroup.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from gppylib.commands.gp import *
99
from gppylib.gparray import GpArray
1010
from gppylib.gplog import get_default_logger
11+
from gppylib.db import dbconn
1112

1213

1314
class GpResGroup(object):
@@ -40,13 +41,40 @@ def validate():
4041

4142
@staticmethod
4243
def validate_v2():
44+
"""
45+
Validate cgroup v2 configuration on all hosts.
46+
47+
This method:
48+
1. Connects to the master database to retrieve gp_resource_group_cgroup_parent
49+
2. Passes this value to gpcheckresgroupv2impl on each host via command line
50+
3. Each host validates its local cgroup filesystem permissions
51+
"""
4352
pool = base.WorkerPool()
4453
gp_array = GpArray.initFromCatalog(dbconn.DbURL(), utility=True)
4554
host_list = list(set(gp_array.get_hostlist(True)))
4655
msg = None
4756

57+
# Get cgroup_parent value from master database
58+
cgroup_parent = None
59+
try:
60+
# Connect to master database to get the GUC parameter
61+
master_dburl = dbconn.DbURL()
62+
with dbconn.connect(master_dburl, utility=True) as conn:
63+
sql = "SHOW gp_resource_group_cgroup_parent"
64+
cursor = dbconn.query(conn, sql)
65+
result = cursor.fetchone()
66+
if result and result[0]:
67+
cgroup_parent = result[0]
68+
else:
69+
return "failed to retrieve gp_resource_group_cgroup_parent parameter from master database"
70+
except Exception as e:
71+
return "failed to retrieve gp_resource_group_cgroup_parent parameter: {}".format(str(e))
72+
73+
# Build command with cgroup_parent parameter
74+
cmd_str = "gpcheckresgroupv2impl --cgroup-parent '{}'".format(cgroup_parent)
75+
4876
for h in host_list:
49-
cmd = Command(h, "gpcheckresgroupv2impl", REMOTE, h)
77+
cmd = Command(h, cmd_str, REMOTE, h)
5078
pool.addCommand(cmd)
5179
pool.join()
5280

src/include/utils/sync_guc_name.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@
102102
"gp_motion_slice_noop",
103103
"gp_random_insert_segments",
104104
"gp_resgroup_debug_wait_queue",
105+
"gp_resource_group_cgroup_parent",
105106
"gp_resgroup_memory_policy_auto_fixed_mem",
106107
"gp_resqueue_memory_policy_auto_fixed_mem",
107108
"gp_resqueue_print_operator_memory_limits",

src/include/utils/unsync_guc_name.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,6 @@
260260
"gp_resource_group_bypass_direct_dispatch",
261261
"gp_resource_group_queuing_timeout",
262262
"gp_resource_group_move_timeout",
263-
"gp_resource_group_cgroup_parent",
264263
"gp_resource_manager",
265264
"gp_resqueue_memory_policy",
266265
"gp_resqueue_priority",

0 commit comments

Comments
 (0)