Skip to content

Commit cd61ab0

Browse files
authored
fix(dre): Fix for the HostOS rollout in groups (#432)
The HostOS rollout did not have the local registry correctly populated so it couldn't calculate the rollout strategy.
1 parent e9d875b commit cd61ab0

File tree

3 files changed

+78
-28
lines changed

3 files changed

+78
-28
lines changed

rs/cli/src/cli.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,16 @@ pub mod hostos {
332332
All,
333333
}
334334

335+
impl std::fmt::Display for NodeOwner {
336+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
337+
match self {
338+
NodeOwner::Dfinity => write!(f, "DFINITY"),
339+
NodeOwner::Others => write!(f, "External"),
340+
NodeOwner::All => write!(f, "DFINITY+External"),
341+
}
342+
}
343+
}
344+
335345
#[derive(ValueEnum, Copy, Clone, Debug, Ord, Eq, PartialEq, PartialOrd, Default)]
336346
pub enum NodeAssignment {
337347
Unassigned,
@@ -340,6 +350,16 @@ pub mod hostos {
340350
All,
341351
}
342352

353+
impl std::fmt::Display for NodeAssignment {
354+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
355+
match self {
356+
NodeAssignment::Unassigned => write!(f, "Unassigned"),
357+
NodeAssignment::Assigned => write!(f, "In Subnet"),
358+
NodeAssignment::All => write!(f, "In Subnet+Unassigned"),
359+
}
360+
}
361+
}
362+
343363
use super::*;
344364
use clap::ValueEnum;
345365
use ic_base_types::PrincipalId;

rs/cli/src/operations/hostos_rollout.rs

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ pub struct HostosRolloutSubnetAffected {
3030
pub subnet_size: usize,
3131
}
3232

33-
#[derive(Eq, PartialEq)]
33+
#[derive(Eq, PartialEq, Debug)]
3434
pub enum HostosRolloutReason {
3535
NoNodeHealthy,
3636
NoNodeWithoutProposal,
@@ -275,23 +275,7 @@ impl HostosRollout {
275275
info!("All valid nodes in the subnet: {} have been updated", subnet_id);
276276
None
277277
} else {
278-
if nodes.len() < nodes_to_take {
279-
debug!(
280-
"Updating all valid nodes ({}) left in the subnet: {}\n\
281-
{}% of all nodes in the subnet",
282-
nodes.len(),
283-
subnet_id,
284-
actual_percent
285-
);
286-
} else {
287-
debug!(
288-
"Updating {} valid nodes in the subnet: {}\n\
289-
{}% of all nodes in the subnet",
290-
nodes.len(),
291-
subnet_id,
292-
actual_percent
293-
);
294-
}
278+
info!("Updating {} nodes ({}%) in the subnet {}", nodes.len(), actual_percent, subnet_id,);
295279
Some(nodes)
296280
}
297281
})
@@ -385,7 +369,22 @@ impl HostosRollout {
385369
nodes_with_open_proposals: Vec<UpdateNodesHostosVersionsProposal>,
386370
update_group: NodeGroupUpdate,
387371
) -> anyhow::Result<HostosRolloutResponse> {
388-
info!("CANDIDATES SELECTION FOR {:?}", &update_group);
372+
info!(
373+
"CANDIDATES SELECTION FROM {} HEALTHY NODES FOR {} {} {}",
374+
nodes_health
375+
.iter()
376+
.filter_map(|(principal, status)| {
377+
if *status == Status::Healthy {
378+
Some(principal)
379+
} else {
380+
None
381+
}
382+
})
383+
.count(),
384+
update_group.maybe_number_nodes.unwrap_or_default(),
385+
update_group.node_group.owner,
386+
update_group.node_group.assignment
387+
);
389388

390389
match update_group.node_group.assignment {
391390
NodeAssignment::Unassigned => {
@@ -398,13 +397,18 @@ impl HostosRollout {
398397
CandidatesSelection::Ok(candidates_unassigned) => {
399398
let nodes_to_take = update_group.nodes_to_take(unassigned_nodes.len());
400399
let nodes_to_update = candidates_unassigned.into_iter().take(nodes_to_take).collect::<Vec<_>>();
400+
info!("{} candidate nodes selected for: {}", nodes_to_update.len(), update_group.node_group);
401401
Ok(HostosRolloutResponse::Ok(nodes_to_update, None))
402402
}
403-
CandidatesSelection::None(reason) => Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)])),
403+
CandidatesSelection::None(reason) => {
404+
info!("No candidate nodes selected for: {} ==> {:?}", update_group.node_group, reason);
405+
Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)]))
406+
}
404407
}
405408
}
406409
NodeAssignment::Assigned => {
407410
let assigned_nodes = self.filter_nodes_in_group(update_group).await?;
411+
info!("{} candidate nodes selected for: {}", assigned_nodes.len(), update_group.node_group);
408412

409413
match self
410414
.candidates_selection(nodes_health, nodes_with_open_proposals, assigned_nodes.clone())
@@ -428,9 +432,13 @@ impl HostosRollout {
428432
})
429433
.collect::<Vec<HostosRolloutSubnetAffected>>();
430434

435+
info!("{} candidate nodes selected for: {}", nodes_to_update.len(), update_group.node_group);
431436
Ok(HostosRolloutResponse::Ok(nodes_to_update, Some(subnets_affected)))
432437
}
433-
CandidatesSelection::None(reason) => Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)])),
438+
CandidatesSelection::None(reason) => {
439+
info!("No candidate nodes selected for: {} ==> {:?}", update_group.node_group, reason);
440+
Ok(HostosRolloutResponse::None(vec![(update_group.node_group, reason)]))
441+
}
434442
}
435443
}
436444
NodeAssignment::All => {
@@ -449,14 +457,32 @@ impl HostosRollout {
449457
)
450458
.await
451459
.map(|response| match response {
452-
(Ok(assigned_nodes, subnet_affected), None(_)) => Ok(assigned_nodes, subnet_affected),
453-
(None(_), Ok(unassigned_nodes, _)) => Ok(unassigned_nodes, Option::None),
460+
(Ok(assigned_nodes, subnet_affected), None(reason)) => {
461+
info!("No unassigned nodes selected for: {:?} ==> {:?}", update_group.node_group, reason);
462+
Ok(assigned_nodes, subnet_affected)
463+
}
464+
(None(reason), Ok(unassigned_nodes, _)) => {
465+
info!("No assigned nodes selected for: {:?} ==> {:?}", update_group.node_group, reason);
466+
Ok(unassigned_nodes, Option::None)
467+
}
454468

455469
(Ok(assigned_nodes, subnet_affected), Ok(unassigned_nodes, _)) => {
470+
info!(
471+
"{} assigned nodes and {} unassigned nodes selected for: {}",
472+
assigned_nodes.len(),
473+
unassigned_nodes.len(),
474+
update_group.node_group
475+
);
456476
Ok(assigned_nodes.into_iter().chain(unassigned_nodes).collect(), subnet_affected.clone())
457477
}
458478

459-
(None(assigned_reason), None(unassigned_reason)) => None(assigned_reason.into_iter().chain(unassigned_reason).collect()),
479+
(None(assigned_reason), None(unassigned_reason)) => {
480+
info!(
481+
"No candidate nodes selected for: {:?} ==> {:?} {:?}",
482+
update_group.node_group, assigned_reason, unassigned_reason
483+
);
484+
None(assigned_reason.into_iter().chain(unassigned_reason).collect())
485+
}
460486
})
461487
}
462488
}

rs/cli/src/runner.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,7 @@ impl Runner {
4646
pub fn as_automation(self) -> Self {
4747
Self {
4848
ic_admin: self.ic_admin.as_automation(),
49-
dashboard_backend_client: self.dashboard_backend_client,
50-
registry: self.registry,
49+
..self
5150
}
5251
}
5352

@@ -182,12 +181,17 @@ impl Runner {
182181
let backend_url = format!("http://localhost:{}/", backend_port);
183182

184183
let dashboard_backend_client = DashboardBackendClient::new_with_backend_url(backend_url);
184+
let mut registry = registry::RegistryState::new(network, true).await;
185+
let node_providers = query_ic_dashboard_list::<NodeProvidersResponse>("v3/node-providers")
186+
.await?
187+
.node_providers;
188+
registry.update_node_details(&node_providers).await?;
189+
185190
Ok(Self {
186191
ic_admin,
187192
dashboard_backend_client,
188193
// TODO: Remove once DREL-118 completed.
189-
// Fake registry that is not used, but some methods still rely on backend.
190-
registry: registry::RegistryState::new(network, true).await,
194+
registry,
191195
})
192196
}
193197

0 commit comments

Comments
 (0)