Skip to content

Commit a222db3

Browse files
committed
feat: Add/Rework slurmctld statistics (#370)
* add slurmctld stats * remove scripts/slurm_msg_type_dict.py This is superseded by the fact that we now use libslurmfull and can access the rpc_num2string function provided there. * Backport Changes: Make it work with 23.11 * Backport Changes: schedule_exit and bf_exit removed, they were only added in 23.11 (cherry picked from commit 5a2fcd5) (cherry picked from commit eedfffa)
1 parent c61651b commit a222db3

File tree

7 files changed

+857
-46
lines changed

7 files changed

+857
-46
lines changed

docs/reference/slurmctld.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ title: slurmctld
66
handler: python
77
options:
88
members: yes
9+
members_order: source

pyslurm/core/slurmctld/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@
55
CgroupConfig,
66
)
77
from .enums import ShutdownMode
8+
from .stats import (
9+
diag,
10+
Statistics,
11+
RPCPending,
12+
RPCUser,
13+
RPCType,
14+
RPCPendingStatistics,
15+
RPCUserStatistics,
16+
RPCTypeStatistics,
17+
)
818
from .base import (
919
PingResponse,
1020
ping,

pyslurm/core/slurmctld/stats.pxd

Lines changed: 363 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
#########################################################################
2+
# slurmctld/stats.pxd - pyslurm slurmctld statistics api (sdiag)
3+
#########################################################################
4+
# Copyright (C) 2025 Toni Harzendorf <toni.harzendorf@gmail.com>
5+
#
6+
#########################################################################
7+
# Much of the documentation here (with some modifications) has been taken from:
8+
# - https://slurm.schedmd.com/sdiag.html
9+
# - https://github.com/SchedMD/slurm/blob/c28fcf4f15981f891df7893099bceda21e2c5e6e/src/sdiag/sdiag.c
10+
#
11+
# So for completeness, the appropriate Copyright notices are also written
12+
# below:
13+
#
14+
# Copyright (C) 2010-2011 Barcelona Supercomputing Center.
15+
# Copyright (C) 2010-2022 SchedMD LLC.
16+
#
17+
# Please also check the Slurm DISCLAIMER at: pyslurm/slurm/SLURM_DISCLAIMER
18+
#########################################################################
19+
20+
# This file is part of PySlurm
21+
#
22+
# PySlurm is free software; you can redistribute it and/or modify
23+
# it under the terms of the GNU General Public License as published by
24+
# the Free Software Foundation; either version 2 of the License, or
25+
# (at your option) any later version.
26+
27+
# PySlurm is distributed in the hope that it will be useful,
28+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
29+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30+
# GNU General Public License for more details.
31+
#
32+
# You should have received a copy of the GNU General Public License along
33+
# with PySlurm; if not, write to the Free Software Foundation, Inc.,
34+
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
35+
#
36+
# cython: c_string_type=unicode, c_string_encoding=default
37+
# cython: language_level=3
38+
39+
from libc.string cimport memset
40+
from pyslurm cimport slurm
41+
from pyslurm.slurm cimport (
42+
stats_info_response_msg_t,
43+
stats_info_request_msg_t,
44+
slurm_get_statistics,
45+
slurm_reset_statistics,
46+
slurm_free_stats_response_msg,
47+
xfree,
48+
xmalloc,
49+
)
50+
from pyslurm.utils cimport cstr
51+
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t
52+
from pyslurm.utils.uint cimport (
53+
u16_parse,
54+
u32_parse,
55+
u64_parse,
56+
u16_parse_bool,
57+
)
58+
59+
cdef extern const char *rpc_num2string(uint16_t msg_type)
60+
61+
cdef parse_response(stats_info_response_msg_t *ptr)
62+
63+
64+
cdef class RPCPending:
65+
"""Statistics for a pending RPC.
66+
67+
Attributes:
68+
id (int):
69+
The numeric ID of the RPC type.
70+
name (str):
71+
The string representation of the RPC.
72+
count (int):
73+
How many RPCs are pending of this type.
74+
"""
75+
cdef public:
76+
id
77+
name
78+
count
79+
80+
81+
cdef class RPCType:
82+
"""Statistics for a specific RPC Type.
83+
84+
Attributes:
85+
id (int):
86+
The numeric ID of the RPC Type.
87+
name (str):
88+
The string representation of the RPC.
89+
count (int):
90+
How many times this RPC was issued since the last time the
91+
statistics were cleared.
92+
time (int):
93+
How much total time it has taken to process this RPC. The unit is
94+
microseconds.
95+
average_time (int):
96+
How much time on average it has taken to process this RPC. The unit
97+
is microseconds.
98+
"""
99+
cdef public:
100+
id
101+
name
102+
count
103+
time
104+
average_time
105+
106+
107+
cdef class RPCUser:
108+
"""RPC Statistics for a specific User.
109+
110+
Attributes:
111+
user_id (int):
112+
The numeric ID of the User.
113+
user_name (str):
114+
The name of the User.
115+
count (int):
116+
How many times the User issued RPCs since the last time the
117+
statistics were cleared.
118+
time (int):
119+
How much total time it has taken to process RPCs by this User. The
120+
unit is microseconds.
121+
average_time (int):
122+
How much time on average it has taken to process RPCs by this User.
123+
The unit is microseconds.
124+
"""
125+
cdef public:
126+
user_id
127+
user_name
128+
count
129+
time
130+
average_time
131+
132+
133+
cdef class RPCTypeStatistics(dict):
134+
"""Collection of [pyslurm.slurmctld.RPCType][] objects.
135+
136+
Attributes:
137+
count (int):
138+
Total number of RPCs made to the `slurmctld` since last reset.
139+
time (int):
140+
Total amount of time it has taken to process all RPCs made so far.
141+
queued (int):
142+
Total number of RPCs queued.
143+
dropped (int):
144+
Total number of RPCs dropped.
145+
"""
146+
@staticmethod
147+
cdef RPCTypeStatistics from_ptr(stats_info_response_msg_t *ptr)
148+
149+
150+
cdef class RPCUserStatistics(dict):
151+
"""Collection of [pyslurm.slurmctld.RPCUser][] objects.
152+
153+
Attributes:
154+
count (int):
155+
Total number of RPCs made to the `slurmctld` since last reset.
156+
time (int):
157+
Total amount of time it has taken to process all RPCs made so far.
158+
"""
159+
@staticmethod
160+
cdef RPCUserStatistics from_ptr(stats_info_response_msg_t *ptr)
161+
162+
163+
cdef class RPCPendingStatistics(dict):
164+
"""Collection of [pyslurm.slurmctld.RPCPending][] objects.
165+
166+
Attributes:
167+
count (int):
168+
Total number of RPCs currently pending.
169+
"""
170+
@staticmethod
171+
cdef RPCPendingStatistics from_ptr(stats_info_response_msg_t *ptr)
172+
173+
174+
cdef class Statistics:
175+
"""Statistics for the `slurmctld`.
176+
177+
For more information, also check out the Slurm [sdiag documentation](https://slurm.schedmd.com/sdiag.html).
178+
179+
Attributes:
180+
request_time (int):
181+
Time when the data was requested. This is a unix timestamp.
182+
data_since (int):
183+
The date when `slurmctld` started gathering statistics. This is a
184+
unix timestamp.
185+
server_thread_count (int):
186+
The number of currently active `slurmctld` threads.
187+
agent_queue_size (int):
188+
Count of enqueued outgoing RPC requests in an internal retry list.
189+
agent_count (int):
190+
Number of agent threads.
191+
agent_thread_count (int):
192+
Total count of active threads created by all the agent threads.
193+
dbd_agent_queue_size (int):
194+
Number of messages intended for the `slurmdbd`. If the `slurmdbd`
195+
goes down, then this number starts going up.
196+
jobs_submitted (int):
197+
Number of jobs submitted since last reset.
198+
jobs_started (int):
199+
Number of jobs started since last reset. This includes backfilled
200+
jobs.
201+
jobs_completed (int):
202+
Number of jobs completed since last reset.
203+
jobs_canceled (int):
204+
Number of jobs canceled since last reset.
205+
jobs_failed (int):
206+
Number of jobs failed due to `slurmd` or other internal issues since
207+
last reset.
208+
jobs_pending (int):
209+
Number of jobs pending.
210+
jobs_running (int):
211+
Number of jobs running.
212+
schedule_cycle_last (int):
213+
Time in microseconds for last scheduling cycle.
214+
schedule_cycle_max (int):
215+
Maximum time in microseconds for any scheduling cycle since last
216+
reset.
217+
schedule_cycle_counter (int):
218+
Total number of scheduling cycles run since last reset.
219+
schedule_cycle_mean (int):
220+
Mean time in microseconds for all scheduling cycles since last
221+
reset.
222+
schedule_cycle_mean_depth (int):
223+
Mean of cycle depth. Depth means number of jobs processed in a
224+
scheduling cycle.
225+
schedule_cycle_sum (int):
226+
Total run time in microseconds for all scheduling cycles since last
227+
reset.
228+
schedule_cycles_per_minute (int):
229+
Counter of scheduling executions per minute.
230+
schedule_queue_length (int):
231+
Length of jobs pending queue.
232+
backfill_active (bool):
233+
Whether these statistics have been gathered during backfilling
234+
operation.
235+
backfilled_jobs (int):
236+
Number of jobs started thanks to backfilling since last Slurm
237+
start.
238+
last_backfilled_jobs (int):
239+
Number of jobs started thanks to backfilling since last time stats
240+
were reset (which is midnight UTC time in this case).
241+
backfilled_het_jobs (int):
242+
Number of heterogeneous job components started thanks to
243+
backfilling since last Slurm start.
244+
backfill_cycle_counter (int):
245+
Number of backfill scheduling cycles since last reset.
246+
backfill_cycle_last_when (int):
247+
Time when last backfill scheduling cycle happened. This is a unix
248+
timestamp.
249+
backfill_cycle_last (int):
250+
Time in microseconds of last backfill scheduling cycle.
251+
backfill_cycle_max (int):
252+
Time in microseconds of maximum backfill scheduling cycle execution
253+
since last reset.
254+
backfill_cycle_mean (int):
255+
Mean time in microseconds of backfilling scheduling cycles since
256+
last reset.
257+
backfill_cycle_sum (int):
258+
Total time in microseconds of backfilling scheduling cycles since
259+
last reset.
260+
backfill_last_depth (int):
261+
Number of processed jobs during last backfilling scheduling cycle.
262+
It counts every job even if that job can not be started due to
263+
dependencies or limits.
264+
backfill_depth_sum (int):
265+
Total number of jobs processed during all backfilling scheduling
266+
cycles since last reset.
267+
backfill_last_depth_try (int):
268+
Number of processed jobs during last backfilling scheduling cycle.
269+
It counts only jobs with a chance to start using available
270+
resources.
271+
backfill_depth_try_sum (int):
272+
Subset of `backfill_depth_sum` that the backfill scheduler
273+
attempted to schedule.
274+
backfill_mean_depth (int):
275+
Mean count of jobs processed during all backfilling scheduling
276+
cycles since last reset. Jobs which are found to be ineligible to
277+
run when examined by the backfill scheduler are not counted.
278+
backfill_mean_depth_try (int):
279+
The subset of `backfill_mean_depth` that the backfill
280+
scheduler attempted to schedule.
281+
backfill_queue_length (int):
282+
Number of jobs pending to be processed by backfilling algorithm. A
283+
job is counted once for each partition it is queued to use.
284+
backfill_queue_length_sum (int):
285+
Total number of jobs pending to be processed by backfilling
286+
algorithm since last reset.
287+
backfill_queue_length_mean (int):
288+
Mean count of jobs pending to be processed by backfilling
289+
algorithm.
290+
backfill_table_size (int):
291+
Count of different time slots tested by the backfill scheduler in
292+
its last iteration.
293+
backfill_table_size_sum (int):
294+
Total number of different time slots tested by the backfill
295+
scheduler.
296+
backfill_table_size_mean (int):
297+
Mean count of different time slots tested by the backfill
298+
scheduler. Larger counts increase the time required for the
299+
backfill operation.
300+
gettimeofday_latency (int):
301+
Latency of 1000 calls to the gettimeofday() syscall in
302+
microseconds, as measured at controller startup.
303+
rpcs_by_type (pyslurm.slurmctld.RPCTypeStatistics):
304+
RPC Statistics organized by Type.
305+
rpcs_by_user (pyslurm.slurmctld.RPCUserStatistics):
306+
RPC Statistics organized by User.
307+
rpcs_pending (pyslurm.slurmctld.RPCPendingStatistics):
308+
Statistics for pending RPCs.
309+
"""
310+
cdef public:
311+
request_time
312+
data_since
313+
server_thread_count
314+
agent_queue_size
315+
agent_count
316+
agent_thread_count
317+
dbd_agent_queue_size
318+
319+
jobs_submitted
320+
jobs_started
321+
jobs_completed
322+
jobs_canceled
323+
jobs_failed
324+
jobs_pending
325+
jobs_running
326+
327+
schedule_cycle_last
328+
schedule_cycle_max
329+
schedule_cycle_counter
330+
schedule_cycle_mean
331+
schedule_cycle_mean_depth
332+
schedule_cycle_sum
333+
schedule_cycles_per_minute
334+
schedule_queue_length
335+
336+
backfill_active
337+
backfilled_jobs
338+
last_backfilled_jobs
339+
backfilled_het_jobs
340+
backfill_cycle_counter
341+
backfill_cycle_last_when
342+
backfill_cycle_last
343+
backfill_cycle_max
344+
backfill_cycle_mean
345+
backfill_cycle_sum
346+
backfill_last_depth
347+
backfill_depth_sum
348+
backfill_last_depth_try
349+
backfill_depth_try_sum
350+
backfill_mean_depth
351+
backfill_mean_depth_try
352+
backfill_queue_length
353+
backfill_queue_length_sum
354+
backfill_queue_length_mean
355+
backfill_table_size
356+
backfill_table_size_sum
357+
backfill_table_size_mean
358+
359+
gettimeofday_latency
360+
361+
rpcs_by_type
362+
rpcs_by_user
363+
rpcs_pending

0 commit comments

Comments
 (0)