Skip to content

Commit 951df2e

Browse files
authored
Merge pull request #187 from firstbatchxyz/erhant/better-pull-and-env
feat: add custom env option & better pulls
2 parents 5a31963 + 7a1c50e commit 951df2e

File tree

13 files changed

+302
-318
lines changed

13 files changed

+302
-318
lines changed

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ default-members = ["compute"]
77

88
[workspace.package]
99
edition = "2021"
10-
version = "0.3.8"
10+
version = "0.3.9"
1111
license = "Apache-2.0"
1212
readme = "README.md"
1313

Makefile

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,19 @@ ifneq (,$(wildcard ./.env))
55
endif
66

77
###############################################################################
8-
.PHONY: launch # | Run with INFO logs in release mode
9-
launch:
10-
cargo run --release --bin dkn-compute
11-
12-
.PHONY: run # | Run with INFO logs
13-
run:
14-
cargo run --bin dkn-compute
15-
16-
.PHONY: monitor # | Run monitor node with INFO logs
17-
monitor:
18-
cargo run --bin dkn-monitor
19-
208
.PHONY: debug # | Run with DEBUG logs with INFO log-level workflows
219
debug:
2210
RUST_LOG=warn,dkn_compute=debug,dkn_workflows=debug,dkn_p2p=debug,ollama_workflows=info \
2311
cargo run --bin dkn-compute
2412

25-
.PHONY: trace # | Run with TRACE logs
26-
trace:
27-
RUST_LOG=warn,dkn_compute=trace,libp2p=debug \
28-
cargo run --bin dkn-compute
29-
3013
.PHONY: build # | Build
3114
build:
3215
cargo build --workspace
3316

34-
.PHONY: profile-cpu # | Profile CPU usage with flamegraph
35-
profile-cpu:
36-
DKN_EXIT_TIMEOUT=120 cargo flamegraph --root --profile=profiling --bin dkn-compute
37-
38-
.PHONY: profile-mem # | Profile memory usage with instruments
39-
profile-mem:
40-
DKN_EXIT_TIMEOUT=120 cargo instruments --profile=profiling -t Allocations --bin dkn-compute
17+
.PHONY: trace # | Run with TRACE logs
18+
trace:
19+
RUST_LOG=warn,dkn_compute=trace,libp2p=debug \
20+
cargo run --bin dkn-compute
4121

4222
.PHONY: ollama-versions
4323
ollama-versions:

README.md

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,13 @@ If you would like to run the node from source (which is really handy during deve
6565
make help
6666
```
6767

68-
You will need OpenSSL installed as well, see shorthand commands [here](https://github.com/sfackler/rust-openssl/issues/855#issuecomment-450057552). While running Ollama elsewhere (if you are using it) or with an OpenAI API key provided, you can run the compute node with:
68+
You will need OpenSSL installed, see shorthand commands [here](https://github.com/sfackler/rust-openssl/issues/855#issuecomment-450057552).
6969

7070
```sh
71-
make run # info-level logs
72-
make debug # debug-level logs
71+
cargo run
72+
73+
# specify custom .env file
74+
DKN_COMPUTE_ENV=./path/to/.env cargo run
7375
```
7476

7577
If you have a valid `.env` file, you can run the latest Docker image via compose as well:
@@ -112,6 +114,32 @@ make lint # clippy
112114
make format # rustfmt
113115
```
114116

117+
### Profiling
118+
119+
We have scripts to profile both CPU and Memory usage. A special build is created for profiling, via a custom `profiling` feature, such that the output inherits `release` mode but also has debug symbols.
120+
121+
Furthermore, the profiling build will exit automatically after a certain time, as if CTRL+C has been pressed. This is needed by the memory profiling tool in particular.
122+
123+
**CPU Profiling**: To create a [flamegraph](https://crates.io/crates/flamegraph) of the application, the command below will create a profiling build that inherits `release` mode, except with debug information:
124+
125+
```sh
126+
DKN_EXIT_TIMEOUT=120 cargo flamegraph --root --profile=profiling --bin dkn-compute
127+
```
128+
129+
> [!NOTE]
130+
>
131+
> CPU profiling may require super-user access.
132+
133+
**Memory Profiling**: To profile memory usage, we make use of [cargo-instruments](https://crates.io/crates/cargo-instruments):
134+
135+
```sh
136+
DKN_EXIT_TIMEOUT=120 cargo instruments --profile=profiling -t Allocations --bin dkn-compute
137+
```
138+
139+
> [!TIP]
140+
>
141+
> You can adjust the profiling duration via the `DKN_EXIT_TIMEOUT` variable, which takes a number of seconds until termination.
142+
115143
## License
116144

117145
This project is licensed under the [Apache License 2.0](https://opensource.org/license/Apache-2.0).

compute/src/gossipsub/pingpong.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,9 @@ impl PingpongHandler {
6565

6666
log::info!("Received a {} for: {}", "ping".blue(), pingpong.uuid);
6767

68-
// record ping moment
68+
// record ping
6969
node.last_pinged_at = Instant::now();
70+
node.num_pings += 1;
7071

7172
// respond
7273
let response_body = PingpongResponse {

compute/src/main.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ use workers::task::TaskWorker;
77

88
#[tokio::main]
99
async fn main() -> Result<()> {
10-
let dotenv_result = dotenvy::dotenv();
10+
// load a particular environment file specified by DKN_COMPUTE_ENV, or `.env` by default
11+
let env_path = env::var("DKN_COMPUTE_ENV").unwrap_or_else(|_| ".env".to_string());
12+
let dotenv_result = dotenvy::from_path(&env_path);
1113

1214
env_logger::builder()
1315
.format_timestamp(Some(env_logger::TimestampPrecision::Millis))
@@ -33,8 +35,8 @@ async fn main() -> Result<()> {
3335

3436
// log about env usage
3537
match dotenv_result {
36-
Ok(path) => log::info!("Loaded .env file at: {}", path.display()),
37-
Err(e) => log::warn!("Could not load .env file: {}", e),
38+
Ok(_) => log::info!("Loaded environment file from {}", env_path),
39+
Err(e) => log::warn!("Could not load environment file from {}: {}", env_path, e),
3840
}
3941

4042
// task tracker for multiple threads

compute/src/node/diagnostic.rs

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,6 @@ impl DriaComputeNode {
2222
pub(crate) async fn handle_diagnostic_refresh(&self) {
2323
let mut diagnostics = vec![format!("Diagnostics (v{}):", DRIA_COMPUTE_NODE_VERSION)];
2424

25-
// if we have not received pings for a while, we are considered offline
26-
let is_offline =
27-
self.last_pinged_at < Instant::now() - Duration::from_secs(PING_LIVENESS_SECS);
28-
2925
// print peer counts
3026
match self.p2p.peer_counts().await {
3127
Ok((mesh, all)) => diagnostics.push(format!(
@@ -74,14 +70,27 @@ impl DriaComputeNode {
7470
));
7571

7672
// add network status as well
77-
diagnostics.push(format!(
78-
"Node Status: {}",
79-
if is_offline {
80-
"OFFLINE".bold().red()
81-
} else {
82-
"ONLINE".bold().green()
83-
}
84-
));
73+
// if we have not received pings for a while, we are considered offline
74+
let is_offline = Instant::now().duration_since(self.last_pinged_at)
75+
> Duration::from_secs(PING_LIVENESS_SECS);
76+
if self.num_pings == 0 {
77+
// if we didnt have any pings, we might still be connecting
78+
diagnostics.push(format!("Node Status: {}", "CONNECTING".yellow()));
79+
} else {
80+
diagnostics.push(format!(
81+
"Node Status: {}",
82+
if is_offline {
83+
"OFFLINE".red()
84+
} else {
85+
"ONLINE".green()
86+
}
87+
));
88+
}
89+
90+
// add pings per second
91+
let elapsed = Instant::now().duration_since(self.started_at).as_secs_f64();
92+
let pings_per_second = self.num_pings as f64 / elapsed; // elapsed is always > 0
93+
diagnostics.push(format!("Pings/sec: {:.3}", pings_per_second));
8594

8695
log::info!("{}", diagnostics.join("\n "));
8796

compute/src/node/mod.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,11 @@ pub struct DriaComputeNode {
3333
pub p2p: DriaP2PCommander,
3434
/// The last time the node was pinged by the network.
3535
/// If this is too much, we can say that the node is not reachable by RPC.
36-
pub last_pinged_at: Instant,
36+
pub(crate) last_pinged_at: Instant,
37+
/// Number of pings received.
38+
pub(crate) num_pings: u64,
39+
/// The time the node was started.
40+
pub(crate) started_at: Instant,
3741
/// Gossipsub message receiver, used by peer-to-peer client in a separate thread.
3842
///
3943
/// It will publish messages sent to this channel to the network.
@@ -143,6 +147,8 @@ impl DriaComputeNode {
143147
initial_steps,
144148
spec_collector: SpecCollector::new(model_names),
145149
last_pinged_at: Instant::now(),
150+
num_pings: 0,
151+
started_at: Instant::now(),
146152
},
147153
p2p_client,
148154
task_batch_worker,

docs/NODE_PERFORMANCE.md

Lines changed: 0 additions & 51 deletions
This file was deleted.

0 commit comments

Comments
 (0)