Skip to content

Commit 8c5af16

Browse files
committed
slurmctld: start adding documentation
1 parent 18af658 commit 8c5af16

File tree

2 files changed

+199
-36
lines changed

2 files changed

+199
-36
lines changed

pyslurm/core/slurmctld.pxd

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,25 @@ from pyslurm.slurm cimport (
3131
slurm_accounting_enforce_string,
3232
slurm_sprint_cpu_bind_type,
3333
slurm_ctl_conf_2_key_pairs,
34+
slurm_reconfigure,
35+
slurm_shutdown,
36+
slurm_ping,
37+
slurm_takeover,
38+
ping_all_controllers,
39+
controller_ping_t,
3440
cpu_bind_type_t,
3541
try_xmalloc,
3642
list_t,
3743
xfree,
3844
)
3945
from pyslurm.utils cimport cstr
4046
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t
41-
from pyslurm.utils.uint cimport *
47+
from pyslurm.utils.uint cimport (
48+
u16_parse,
49+
u32_parse,
50+
u64_parse,
51+
u16_parse_bool,
52+
)
4253

4354
from pyslurm.db.util cimport (
4455
SlurmList,
@@ -54,6 +65,17 @@ ctypedef struct config_key_pair_t:
5465
char *value
5566

5667

68+
cdef class PingResponse:
69+
"""Slurm Controller Ping response information"""
70+
71+
cdef public:
72+
is_primary
73+
is_responding
74+
index
75+
hostname
76+
latency
77+
78+
5779
cdef class Config:
5880
cdef slurm_conf_t *ptr
5981

@@ -64,6 +86,7 @@ cdef class Config:
6486

6587

6688
cdef class MPIConfig:
89+
"""Slurm MPI Config (mpi.conf)"""
6790

6891
cdef public:
6992
pmix_cli_tmp_dir_base
@@ -83,6 +106,7 @@ cdef class MPIConfig:
83106
cdef MPIConfig from_ptr(void *ptr)
84107

85108
cdef class CgroupConfig:
109+
"""Slurm Cgroup Config (cgroup.conf)"""
86110

87111
cdef public:
88112
mountpoint
@@ -110,6 +134,7 @@ cdef class CgroupConfig:
110134

111135

112136
cdef class AccountingGatherConfig:
137+
"""Slurm Accounting Gather Config (acct_gather.conf)"""
113138

114139
cdef public:
115140
energy_ipmi_frequency

pyslurm/core/slurmctld.pyx

Lines changed: 173 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,167 @@
2323
# cython: language_level=3
2424

2525
from pyslurm.core.error import verify_rpc, RPCError
26-
from pyslurm.utils.uint import *
26+
from pyslurm.utils.uint import (
27+
u16_parse,
28+
u32_parse,
29+
u64_parse,
30+
)
31+
from pyslurm.constants import UNLIMITED
2732
from pyslurm.utils.ctime import _raw_time
2833
from pyslurm.utils.helpers import (
2934
cpu_freq_int_to_str,
3035
instance_to_dict,
3136
)
3237
from pyslurm.utils import cstr
38+
from typing import Union
39+
import time
40+
from enum import IntEnum
41+
42+
43+
class ShutdownMode(IntEnum):
44+
"""Mode of operation for shutdown action"""
45+
ALL = 0
46+
CORE_FILE = 1
47+
CONTROLLER_ONLY = 2
48+
49+
50+
cdef class PingResponse:
51+
52+
def to_dict(self):
53+
"""Slurmctld ping response formatted as dictionary.
54+
55+
Returns:
56+
(dict): Ping response as a dict
57+
58+
Examples:
59+
>>> from pyslurm import slurmctld
60+
>>> ctld_primary = slurmctld.ping(0)
61+
>>> primary_dict = ctld_primary.to_dict()
62+
"""
63+
return instance_to_dict(self)
64+
65+
66+
def ping(index):
    """Ping a Slurm controller

    Args:
        index (int):
            Index of the controller to ping. `0` is reported as the
            primary; any other index is reported as a non-primary
            controller.

    Returns:
        (pyslurm.slurmctld.PingResponse): a ping response

    Raises:
        (pyslurm.RPCError): When `index` does not refer to a configured
            controller, or when the ping was not successful.

    Examples:
        >>> from pyslurm import slurmctld
        >>> resp = slurmctld.ping(0)
        >>> print(resp.hostname, resp.latency)
        slurmctl 1.246
    """
    # Validate the index *before* issuing the RPC. Checking afterwards means
    # an invalid index costs a round trip and may surface as an unrelated RPC
    # failure. Rejecting negative values also keeps the `control_machine`
    # lookup below inside the array.
    ctl_cnt = slurm.slurm_conf.control_cnt
    if index < 0 or index >= ctl_cnt:
        raise RPCError(msg="Invalid Index specified.")

    # Time only the RPC itself.
    t0 = time.perf_counter()
    rc = slurm_ping(index)
    t1 = time.perf_counter()

    # NOTE(review): verify_rpc presumably raises RPCError on a non-zero
    # return code, in which case `is_responding` is always True here - confirm.
    verify_rpc(rc)

    info = PingResponse()
    info.is_primary = index == 0
    info.is_responding = not rc
    info.index = index
    info.hostname = cstr.to_unicode(slurm.slurm_conf.control_machine[index])
    # perf_counter() returns seconds; latency is reported in milliseconds.
    info.latency = round((t1 - t0) * 1000, 3)

    return info
96+
97+
98+
def ping_primary():
99+
"""Ping the primary Slurm Controller.
100+
101+
See `ping()` for more information and examples.
102+
103+
Returns:
104+
(pyslurm.slurmctld.PingResponse): a ping response
105+
"""
106+
return ping(0)
107+
108+
109+
def ping_backup():
110+
"""Ping the first backup Slurm Controller.
111+
112+
See `ping()` for more information and examples.
113+
114+
Returns:
115+
(pyslurm.slurmctld.PingResponse): a ping response
116+
"""
117+
return ping(1)
118+
119+
120+
def ping_all():
121+
"""Ping all Slurm Controllers.
122+
123+
Returns:
124+
(list[pyslurm.slurmctld.PingResponse]): a list of ping responses
125+
126+
Raises:
127+
(pyslurm.RPCError): When the ping was not successful.
128+
129+
Examples:
130+
>>> from pyslurm import slurmctld
131+
>>> resps = slurmctld.ping_all()
132+
>>> for resp in resps:
133+
... print(resp.hostname, resp.latency)
134+
...
135+
slurmctl 1.246
136+
slurmctlbackup 1.373
137+
"""
138+
cdef list out = []
139+
140+
ctl_cnt = slurm.slurm_conf.control_cnt
141+
for i in range(ctl_cnt):
142+
out.append(ping(i))
143+
144+
return out
145+
146+
147+
def shutdown(mode: Union[ShutdownMode, int]):
148+
"""Shutdown Slurm Controller or all Daemons
149+
150+
Args:
151+
mode:
152+
Whether only the Slurm controller should be shut down, or also all other
153+
slurmd daemons.
154+
155+
Raises:
156+
(pyslurm.RPCError): When shutting down the daemons was not successful.
157+
"""
158+
verify_rpc(slurm_shutdown(int(mode)))
159+
160+
161+
def reconfigure():
162+
"""Trigger Slurm Controller to reload the Config
163+
164+
Raises:
165+
(pyslurm.RPCError): When reconfiguring was not successful.
166+
"""
167+
verify_rpc(slurm_reconfigure())
168+
169+
170+
def takeover(index = 1):
171+
"""Let a Backup Slurm Controller take over as the Primary.
172+
173+
Args:
174+
index (int, optional = 1):
175+
Index of the Backup Controller that should take over. By default,
176+
the `index` is `1`, meaning the next Controller configured after
177+
the Primary in slurm.conf (second `SlurmctlHost` entry) will be
178+
asked to take over operation.
179+
180+
If you have more than one backup controller configured, you can for
181+
example also pass `2` as the index.
182+
183+
Raises:
184+
(pyslurm.RPCError): When the takeover was not successful.
185+
"""
186+
verify_rpc(slurm_takeover(index))
33187

34188

35189
cdef class MPIConfig:
@@ -75,7 +229,7 @@ cdef class MPIConfig:
75229

76230
cdef class CgroupConfig:
77231

78-
def __init__(self, job_id):
232+
def __init__(self):
79233
raise RuntimeError("Cannot instantiate class directly")
80234

81235
def to_dict(self):
@@ -121,7 +275,7 @@ cdef class CgroupConfig:
121275

122276
cdef class AccountingGatherConfig:
123277

124-
def __init__(self, job_id):
278+
def __init__(self):
125279
raise RuntimeError("Cannot instantiate class directly")
126280

127281
def to_dict(self):
@@ -147,7 +301,7 @@ cdef class AccountingGatherConfig:
147301
out.energy_ipmi_calc_adjustment = _yesno_to_bool(
148302
conf.get("EnergyIPMICalcAdjustment"))
149303

150-
# TODO: dict
304+
# TODO: maybe dict?
151305
out.energy_ipmi_power_sensors = conf.get("EnergyIPMIPowerSensors")
152306

153307
out.energy_ipmi_user_name = conf.get("EnergyIPMIUsername")
@@ -176,8 +330,9 @@ cdef class Config:
176330
def __cinit__(self):
177331
self.ptr = NULL
178332

179-
def __init__(self, job_id):
180-
raise RuntimeError("Cannot instantiate class directly")
333+
def __init__(self):
334+
raise RuntimeError("Cannot instantiate class directly. "
335+
"Use slurmctld.Config.load() to get an instance.")
181336

182337
def __dealloc__(self):
183338
slurm_free_ctl_conf(self.ptr)
@@ -201,13 +356,21 @@ cdef class Config:
201356

202357
@staticmethod
203358
def load():
359+
"""Load the current Slurm configuration (slurm.conf)
360+
361+
This also loads the following other configurations:
362+
* `cgroup.conf` (`cgroup_config`)
363+
* `acct_gather.conf` (`accounting_gather_config`)
364+
* `mpi.conf` (`mpi_config`)
365+
"""
204366
cdef Config conf = Config.__new__(Config)
205367
verify_rpc(slurm_load_ctl_conf(0, &conf.ptr))
206368

207369
conf.cgroup_config = CgroupConfig.from_ptr(conf.ptr.cgroup_conf)
208370
conf.accounting_gather_config = AccountingGatherConfig.from_ptr(
209371
conf.ptr.acct_gather_conf)
210372
conf.mpi_config = MPIConfig.from_ptr(conf.ptr.mpi_conf)
373+
# TODO: node_features_conf
211374

212375
return conf
213376

@@ -431,16 +594,6 @@ cdef class Config:
431594
return cstr.to_list_with_count(self.ptr.epilog_slurmctld,
432595
self.ptr.epilog_slurmctld_cnt)
433596

434-
# @property
435-
# def external_sensors_type(self):
436-
# return cstr.to_unicode(self.ptr.ext_sensors_type)
437-
438-
# @property
439-
# def external_sensors_frequency(self):
440-
# return u16_parse(self.ptr.ext_sensors_freq)
441-
442-
# TODO: void *ext_sensors_conf put into own class?
443-
444597
@property
445598
def federation_parameters(self):
446599
return cstr.to_list(self.ptr.fed_params)
@@ -469,7 +622,6 @@ cdef class Config:
469622

470623
@property
471624
def group_update_force(self):
472-
# TODO: maybe bool?
473625
return u16_parse_bool(self.ptr.group_force)
474626

475627
@property
@@ -485,7 +637,6 @@ cdef class Config:
485637
val = u32_parse(self.ptr.hash_val)
486638
if not val:
487639
return None
488-
489640
return hex(val)
490641

491642
@property
@@ -534,10 +685,6 @@ cdef class Config:
534685
def job_completion_parameters(self):
535686
return cstr.to_list(self.ptr.job_comp_params)
536687

537-
# @property
538-
# def job_completion_password(self):
539-
# return cstr.to_unicode(self.ptr.job_comp_pass)
540-
541688
@property
542689
def job_completion_port(self):
543690
return u32_parse(self.ptr.job_comp_port)
@@ -675,8 +822,6 @@ cdef class Config:
675822
def next_job_id(self):
676823
return u32_parse(self.ptr.next_job_id)
677824

678-
# TODO: void *node_features_conf put into own class?
679-
680825
@property
681826
def node_features_plugins(self):
682827
return cstr.to_list(self.ptr.node_features_plugins)
@@ -686,22 +831,13 @@ cdef class Config:
686831
return u16_parse(self.ptr.over_time_limit)
687832

688833
@property
689-
def plugin_path(self):
690-
# TODO: maybe list
691-
return cstr.to_unicode(self.ptr.plugindir)
834+
def plugin_dirs(self):
835+
return cstr.to_list(self.ptr.plugindir, None, ":")
692836

693837
@property
694838
def plugin_stack_config(self):
695839
return cstr.to_unicode(self.ptr.plugstack)
696840

697-
# @property
698-
# def power_parameters(self):
699-
# return cstr.to_list(self.ptr.power_parameters)
700-
701-
# @property
702-
# def power_plugin(self):
703-
# return cstr.to_unicode(self.ptr.power_plugin)
704-
705841
@property
706842
def preempt_exempt_time(self):
707843
# seconds?
@@ -1295,6 +1431,7 @@ def _log_level_int_to_str(flags):
12951431
else:
12961432
return data
12971433

1434+
12981435
def _acct_store_flags_int_to_str(flags):
12991436
cdef list out = []
13001437

@@ -1311,6 +1448,7 @@ def _acct_store_flags_int_to_str(flags):
13111448

13121449
return out
13131450

1451+
13141452
def _get_memory(value, per_cpu):
13151453
if value != slurm.NO_VAL64:
13161454
if value & slurm.MEM_PER_CPU and per_cpu:

0 commit comments

Comments
 (0)