|
| 1 | +######################################################################### |
| 2 | +# slurmctld/stats.pxd - pyslurm slurmctld statistics api (sdiag) |
| 3 | +######################################################################### |
| 4 | +# Copyright (C) 2025 Toni Harzendorf <toni.harzendorf@gmail.com> |
| 5 | +# |
| 6 | +######################################################################### |
| 7 | +# Much of the documentation here (with some modifications) has been taken from: |
| 8 | +# - https://slurm.schedmd.com/sdiag.html |
| 9 | +# - https://github.com/SchedMD/slurm/blob/c28fcf4f15981f891df7893099bceda21e2c5e6e/src/sdiag/sdiag.c |
| 10 | +# |
| 11 | +# So for completeness, the appropriate Copyright notices are also written |
| 12 | +# below: |
| 13 | +# |
| 14 | +# Copyright (C) 2010-2011 Barcelona Supercomputing Center. |
| 15 | +# Copyright (C) 2010-2022 SchedMD LLC. |
| 16 | +# |
| 17 | +# Please also check the Slurm DISCLAIMER at: pyslurm/slurm/SLURM_DISCLAIMER |
| 18 | +######################################################################### |
| 19 | + |
| 20 | +# This file is part of PySlurm |
| 21 | +# |
| 22 | +# PySlurm is free software; you can redistribute it and/or modify |
| 23 | +# it under the terms of the GNU General Public License as published by |
| 24 | +# the Free Software Foundation; either version 2 of the License, or |
| 25 | +# (at your option) any later version. |
| 26 | + |
| 27 | +# PySlurm is distributed in the hope that it will be useful, |
| 28 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 29 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 30 | +# GNU General Public License for more details. |
| 31 | +# |
| 32 | +# You should have received a copy of the GNU General Public License along |
| 33 | +# with PySlurm; if not, write to the Free Software Foundation, Inc., |
| 34 | +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| 35 | +# |
| 36 | +# cython: c_string_type=unicode, c_string_encoding=default |
| 37 | +# cython: language_level=3 |
| 38 | + |
| 39 | +from libc.string cimport memset |
| 40 | +from pyslurm cimport slurm |
| 41 | +from pyslurm.slurm cimport ( |
| 42 | + stats_info_response_msg_t, |
| 43 | + stats_info_request_msg_t, |
| 44 | + slurm_get_statistics, |
| 45 | + slurm_reset_statistics, |
| 46 | + slurm_free_stats_response_msg, |
| 47 | + xfree, |
| 48 | + xmalloc, |
| 49 | +) |
| 50 | +from pyslurm.utils cimport cstr |
| 51 | +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t |
| 52 | +from pyslurm.utils.uint cimport ( |
| 53 | + u16_parse, |
| 54 | + u32_parse, |
| 55 | + u64_parse, |
| 56 | + u16_parse_bool, |
| 57 | +) |
| 58 | + |
# Externally defined C function (no body in this file): takes a uint16_t
# message type and returns a `const char *`.
# NOTE(review): presumably maps an RPC number to its printable name -- confirm
# against the Slurm sources.
cdef extern const char *rpc_num2string(uint16_t msg_type)

# Declaration of a module-level cdef function taking a pointer to a
# stats_info_response_msg_t. No return type is declared, so it returns a
# Python object. The implementation is not part of this declaration file.
cdef parse_response(stats_info_response_msg_t *ptr)
| 62 | + |
| 63 | + |
cdef class RPCPending:
    """Statistics for one type of RPC that is currently pending.

    Attributes:
        id (int):
            Numeric ID of the RPC type.
        name (str):
            String representation of the RPC.
        count (int):
            Number of RPCs of this type that are pending.
    """
    # All attributes are generic Python objects exposed read/write to Python.
    cdef public:
        object id
        object name
        object count
| 79 | + |
| 80 | + |
cdef class RPCType:
    """Statistics for one specific RPC type.

    Attributes:
        id (int):
            Numeric ID of the RPC type.
        name (str):
            String representation of the RPC.
        count (int):
            Number of times this RPC was issued since the statistics were
            last cleared.
        time (int):
            Total time spent processing this RPC, in microseconds.
        average_time (int):
            Average time spent processing this RPC, in microseconds.
    """
    # All attributes are generic Python objects exposed read/write to Python.
    cdef public:
        object id
        object name
        object count
        object time
        object average_time
| 105 | + |
| 106 | + |
cdef class RPCUser:
    """RPC statistics for one specific user.

    Attributes:
        user_id (int):
            Numeric ID of the user.
        user_name (str):
            Name of the user.
        count (int):
            Number of RPCs the user issued since the statistics were last
            cleared.
        time (int):
            Total time spent processing RPCs issued by this user, in
            microseconds.
        average_time (int):
            Average time spent processing RPCs issued by this user, in
            microseconds.
    """
    # All attributes are generic Python objects exposed read/write to Python.
    cdef public:
        object user_id
        object user_name
        object count
        object time
        object average_time
| 131 | + |
| 132 | + |
cdef class RPCTypeStatistics(dict):
    """Collection of [pyslurm.slurmctld.RPCType][] objects.

    This class is a `dict` subclass.

    Attributes:
        count (int):
            Total number of RPCs made to the `slurmctld` since the last
            reset.
        time (int):
            Total amount of time it has taken to process all RPCs made so
            far.
        queued (int):
            Total number of RPCs queued.
        dropped (int):
            Total number of RPCs dropped.
    """
    # C-level alternate constructor: takes a pointer to a
    # stats_info_response_msg_t and returns an RPCTypeStatistics instance.
    # Only the prototype lives here; the implementation is not in this file.
    @staticmethod
    cdef RPCTypeStatistics from_ptr(stats_info_response_msg_t *ptr)
| 148 | + |
| 149 | + |
cdef class RPCUserStatistics(dict):
    """Collection of [pyslurm.slurmctld.RPCUser][] objects.

    This class is a `dict` subclass.

    Attributes:
        count (int):
            Total number of RPCs made to the `slurmctld` since the last
            reset.
        time (int):
            Total amount of time it has taken to process all RPCs made so
            far.
    """
    # C-level alternate constructor: takes a pointer to a
    # stats_info_response_msg_t and returns an RPCUserStatistics instance.
    # Only the prototype lives here; the implementation is not in this file.
    @staticmethod
    cdef RPCUserStatistics from_ptr(stats_info_response_msg_t *ptr)
| 161 | + |
| 162 | + |
cdef class RPCPendingStatistics(dict):
    """Collection of [pyslurm.slurmctld.RPCPending][] objects.

    This class is a `dict` subclass.

    Attributes:
        count (int):
            Total number of RPCs currently pending.
    """
    # C-level alternate constructor: takes a pointer to a
    # stats_info_response_msg_t and returns an RPCPendingStatistics instance.
    # Only the prototype lives here; the implementation is not in this file.
    @staticmethod
    cdef RPCPendingStatistics from_ptr(stats_info_response_msg_t *ptr)
| 172 | + |
| 173 | + |
cdef class Statistics:
    """Statistics for the `slurmctld`.

    For more information, also check out the Slurm [sdiag documentation](https://slurm.schedmd.com/sdiag.html).

    Attributes:
        request_time (int):
            Time when the data was requested. This is a unix timestamp.
        data_since (int):
            The date when `slurmctld` started gathering statistics. This is a
            unix timestamp.
        server_thread_count (int):
            The number of currently active `slurmctld` threads.
        agent_queue_size (int):
            Count of enqueued outgoing RPC requests in an internal retry list.
        agent_count (int):
            Number of agent threads.
        agent_thread_count (int):
            Total count of active threads created by all the agent threads.
        dbd_agent_queue_size (int):
            Number of messages intended for the `slurmdbd`. If the `slurmdbd`
            goes down, then this number starts going up.
        jobs_submitted (int):
            Number of jobs submitted since last reset.
        jobs_started (int):
            Number of jobs started since last reset. This includes backfilled
            jobs.
        jobs_completed (int):
            Number of jobs completed since last reset.
        jobs_canceled (int):
            Number of jobs canceled since last reset.
        jobs_failed (int):
            Number of jobs failed due to `slurmd` or other internal issues
            since last reset.
        jobs_pending (int):
            Number of jobs pending.
        jobs_running (int):
            Number of jobs running.
        schedule_cycle_last (int):
            Time in microseconds for last scheduling cycle.
        schedule_cycle_max (int):
            Maximum time in microseconds for any scheduling cycle since last
            reset.
        schedule_cycle_counter (int):
            Total number of scheduling cycles run since last reset.
        schedule_cycle_mean (int):
            Mean time in microseconds for all scheduling cycles since last
            reset.
        schedule_cycle_mean_depth (int):
            Mean of cycle depth. Depth means number of jobs processed in a
            scheduling cycle.
        schedule_cycle_sum (int):
            Total run time in microseconds for all scheduling cycles since
            last reset.
        schedule_cycles_per_minute (int):
            Counter of scheduling executions per minute.
        schedule_queue_length (int):
            Length of jobs pending queue.
        backfill_active (bool):
            Whether these statistics have been gathered during backfilling
            operation.
        backfilled_jobs (int):
            Number of jobs started thanks to backfilling since last slurm
            start.
        last_backfilled_jobs (int):
            Number of jobs started thanks to backfilling since the last time
            stats were reset (which is midnight UTC time in this case).
        backfilled_het_jobs (int):
            Number of heterogeneous job components started thanks to
            backfilling since last Slurm start.
        backfill_cycle_counter (int):
            Number of backfill scheduling cycles since last reset.
        backfill_cycle_last_when (int):
            Time when last backfill scheduling cycle happened. This is a unix
            timestamp.
        backfill_cycle_last (int):
            Time in microseconds of last backfill scheduling cycle.
        backfill_cycle_max (int):
            Time in microseconds of maximum backfill scheduling cycle
            execution since last reset.
        backfill_cycle_mean (int):
            Mean time in microseconds of backfilling scheduling cycles since
            last reset.
        backfill_cycle_sum (int):
            Total time in microseconds of backfilling scheduling cycles since
            last reset.
        backfill_last_depth (int):
            Number of processed jobs during last backfilling scheduling cycle.
            It counts every job even if that job can not be started due to
            dependencies or limits.
        backfill_depth_sum (int):
            Total number of jobs processed during all backfilling scheduling
            cycles since last reset.
        backfill_last_depth_try (int):
            Number of processed jobs during last backfilling scheduling cycle.
            It counts only jobs with a chance to start using available
            resources.
        backfill_depth_try_sum (int):
            Subset of `backfill_depth_sum` that the backfill scheduler
            attempted to schedule.
        backfill_mean_depth (int):
            Mean count of jobs processed during all backfilling scheduling
            cycles since last reset. Jobs which are found to be ineligible to
            run when examined by the backfill scheduler are not counted.
        backfill_mean_depth_try (int):
            The subset of `backfill_mean_depth` that the backfill
            scheduler attempted to schedule.
        backfill_queue_length (int):
            Number of jobs pending to be processed by backfilling algorithm. A
            job is counted once for each partition it is queued to use.
        backfill_queue_length_sum (int):
            Total number of jobs pending to be processed by backfilling
            algorithm since last reset.
        backfill_queue_length_mean (int):
            Mean count of jobs pending to be processed by backfilling
            algorithm.
        backfill_table_size (int):
            Count of different time slots tested by the backfill scheduler in
            its last iteration.
        backfill_table_size_sum (int):
            Total number of different time slots tested by the backfill
            scheduler.
        backfill_table_size_mean (int):
            Mean count of different time slots tested by the backfill
            scheduler. Larger counts increase the time required for the
            backfill operation.
        gettimeofday_latency (int):
            Latency of 1000 calls to the gettimeofday() syscall in
            microseconds, as measured at controller startup.
        rpcs_by_type (pyslurm.slurmctld.RPCTypeStatistics):
            RPC Statistics organized by Type.
        rpcs_by_user (pyslurm.slurmctld.RPCUserStatistics):
            RPC Statistics organized by User.
        rpcs_pending (pyslurm.slurmctld.RPCPendingStatistics):
            Statistics for pending RPCs.
    """
    # Every attribute is declared as a generic Python object and exposed
    # read/write to Python; the groups below mirror the docstring order.
    cdef public:
        object request_time
        object data_since
        object server_thread_count
        object agent_queue_size
        object agent_count
        object agent_thread_count
        object dbd_agent_queue_size

        object jobs_submitted
        object jobs_started
        object jobs_completed
        object jobs_canceled
        object jobs_failed
        object jobs_pending
        object jobs_running

        object schedule_cycle_last
        object schedule_cycle_max
        object schedule_cycle_counter
        object schedule_cycle_mean
        object schedule_cycle_mean_depth
        object schedule_cycle_sum
        object schedule_cycles_per_minute
        object schedule_queue_length

        object backfill_active
        object backfilled_jobs
        object last_backfilled_jobs
        object backfilled_het_jobs
        object backfill_cycle_counter
        object backfill_cycle_last_when
        object backfill_cycle_last
        object backfill_cycle_max
        object backfill_cycle_mean
        object backfill_cycle_sum
        object backfill_last_depth
        object backfill_depth_sum
        object backfill_last_depth_try
        object backfill_depth_try_sum
        object backfill_mean_depth
        object backfill_mean_depth_try
        object backfill_queue_length
        object backfill_queue_length_sum
        object backfill_queue_length_mean
        object backfill_table_size
        object backfill_table_size_sum
        object backfill_table_size_mean

        object gettimeofday_latency

        object rpcs_by_type
        object rpcs_by_user
        object rpcs_pending
0 commit comments