[WIP] feat: D node sends a pull signal to P node when it has sufficient resources.

tzh21 · tzh21 · commit 46553949533c · 2025-09-15T12:57:09.000+08:00
diff --git a/xllm/core/common/macros.h b/xllm/core/common/macros.h
@@ -62,6 +62,6 @@ namespace xllm {
 
 #define CALLBACK_WITH_ERROR(CODE, MSG) callback(Status{CODE, MSG});
 
-#define DLOG VLOG(1) << "[Local offline] "
+#define DVLOG VLOG(1) << "[Offline pull] "  // NOTE(review): if logging is glog, it already defines a function-like DVLOG(verboselevel) — confirm this redefinition does not clash
 
 }  // namespace xllm
diff --git a/xllm/core/distributed_runtime/pd_ooc_service.cpp b/xllm/core/distributed_runtime/pd_ooc_service.cpp
@@ -62,4 +62,12 @@ void PDOOCService::FirstGeneration(
   pd_ooc_service_impl_->decode_recv_first_generation(request, response);
 }
 
+void PDOOCService::SendPullSignal(::google::protobuf::RpcController* controller,
+                                  const proto::PullSignal* request,
+                                  proto::Status* response,
+                                  ::google::protobuf::Closure* done) {
+  brpc::ClosureGuard done_guard(done);  // presumably runs `done` on scope exit — confirm against brpc
+  pd_ooc_service_impl_->prefill_recv_pull_signal(request, response);  // forwards to the impl; `controller` is unused here
+}
+
 }  // namespace xllm
diff --git a/xllm/core/distributed_runtime/pd_ooc_service.h b/xllm/core/distributed_runtime/pd_ooc_service.h
@@ -50,6 +50,11 @@ class PDOOCService : public proto::PDOOCService {
                        proto::Status* response,
                        ::google::protobuf::Closure* done) override;
 
+  void SendPullSignal(::google::protobuf::RpcController* controller,
+                      const proto::PullSignal* request,
+                      proto::Status* response,
+                      ::google::protobuf::Closure* done) override;
+
  private:
   DISALLOW_COPY_AND_ASSIGN(PDOOCService);
   std::unique_ptr<PDOOCServiceImpl> pd_ooc_service_impl_;
diff --git a/xllm/core/distributed_runtime/pd_ooc_service_impl.cpp b/xllm/core/distributed_runtime/pd_ooc_service_impl.cpp
@@ -286,4 +286,15 @@ void PDOOCServiceImpl::prefill_recv_generations(
   }
 }
 
+void PDOOCServiceImpl::prefill_recv_pull_signal(
+    const proto::PullSignal* request,
+    proto::Status* response) {
+  // Hand a copy of *request (not null-checked) to the scheduler.
+  bool result = scheduler_->write_pull_signal(proto::PullSignal(*request));
+
+  if (response) {  // a null response is tolerated: status is written only when one is supplied
+    response->set_ok(result);  // ok mirrors write_pull_signal()'s return value
+  }
+}
+
 }  // namespace xllm
diff --git a/xllm/core/distributed_runtime/pd_ooc_service_impl.h b/xllm/core/distributed_runtime/pd_ooc_service_impl.h
@@ -45,6 +45,9 @@ class PDOOCServiceImplInterface {
   virtual void prefill_recv_generations(
       const proto::DisaggStreamGenerations* requests,
       proto::StatusSet* responses) {}
+
+  virtual void prefill_recv_pull_signal(const proto::PullSignal* request,
+                                        proto::Status* response) {}  // default implementation is a no-op
 };
 
 class PDOOCServiceImpl final : public PDOOCServiceImplInterface {
@@ -64,6 +67,9 @@ class PDOOCServiceImpl final : public PDOOCServiceImplInterface {
   void decode_recv_first_generation(const proto::DisaggGenerations* request,
                                     proto::Status* response) override;
 
+  void prefill_recv_pull_signal(const proto::PullSignal* request,
+                                proto::Status* response) override;
+
  private:
   std::shared_ptr<Request> generate_request(const proto::DisaggRequest& req);
 
diff --git a/xllm/core/runtime/xservice_client.cpp b/xllm/core/runtime/xservice_client.cpp
@@ -403,6 +403,23 @@ std::vector<std::string> XServiceClient::get_static_decode_list() {
   return std::vector<std::string>(resp.names().begin(), resp.names().end());
 }
 
+std::vector<std::string> XServiceClient::get_static_prefill_list() {
+  brpc::Controller cntl;
+  xllm_service::proto::InstanceID req;
+  xllm_service::proto::InstanceIDs resp;
+  req.set_name(instance_name_);  // the query carries this client's instance name
+  {
+    std::shared_lock<std::shared_mutex> lock(mutex_);  // held only for the stub call below
+    xservice_stub_->GetStaticPrefillList(&cntl, &req, &resp, nullptr);
+  }
+  if (cntl.Failed()) {
+    LOG(ERROR) << "Fail to get static prefill list from xservice server "
+               << xservice_addr_ << ", error text: " << cntl.ErrorText();
+    return {};  // failure is logged and reported to the caller as an empty list
+  }
+  return std::vector<std::string>(resp.names().begin(), resp.names().end());
+}
+
 ServiceConfig XServiceClient::get_config() {
   brpc::Controller cntl;
   xllm_service::proto::Empty req;
diff --git a/xllm/core/runtime/xservice_client.h b/xllm/core/runtime/xservice_client.h
@@ -56,6 +56,7 @@ class XServiceClient {
   void heartbeat();
   InstanceInfo get_instance_info(const std::string& instance_name);
   std::vector<std::string> get_static_decode_list();
+  std::vector<std::string> get_static_prefill_list();
   ServiceConfig get_config();
 
   // response generation tokens to xllm service
diff --git a/xllm/core/scheduler/continuous_scheduler.cpp b/xllm/core/scheduler/continuous_scheduler.cpp
@@ -612,9 +612,9 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
   while (request_queue_.read(request)) {
     CHECK(request);
 
-    if (request->offline()) {
-      DLOG << "Read an offline request from request_queue_";
-    }
+    // if (request->offline()) {
+    //   DVLOG << "Read an offline request from request_queue_";
+    // }
 
     // expand sequences to the target number if prefix cache is disabled.
     if (!enable_prefix_cache_) {
@@ -625,10 +625,12 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
     if (request->sequences()[0]->kv_state().kv_cache_tokens_num() == 0) {
       if (request->offline()) {
         waiting_priority_queue_offline_.push(request);
-        DLOG << "Put an offline request into waiting_priority_queue_offline_";
+        // DVLOG << "Put an offline request into "
+        //          "waiting_priority_queue_offline_";
       } else {
         waiting_priority_queue_.push(request);
-        DLOG << "Put an online request into waiting_priority_queue_offline_";
+        // DVLOG << "Put an online request into "
+        //          "waiting_priority_queue_";
       }
     } else {
       // request from prefill instance in disagge pd mode.
@@ -646,7 +648,7 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
     std::shared_ptr<Request> request = *it;
     request->update_connection_status();
     if (request->finished() || request->cancelled()) {
-      DLOG << "Found a finished request in running_requests_";
+      // DVLOG << "Found a finished request in running_requests_";
       block_manager_pool_->deallocate(request.get());
       // release the ownership of the request
       finished_requests.emplace_back(request);
@@ -671,10 +673,10 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
         handle_running_requests(*it);
         if ((*it)->offline()) {
           running_queue_offline_->push(*it, last_step_prefill_);
-          DLOG << "Put an offline request into running_queue_offline_";
+          // DVLOG << "Put an offline request into running_queue_offline_";
         } else {
           running_queue_->push(*it, last_step_prefill_);
-          DLOG << "Put an online request into running_queue_";
+          // DVLOG << "Put an online request into running_queue_";
         }
       }
     } else {
@@ -697,16 +699,17 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
         handle_running_requests(*it);
         if ((*it)->offline()) {
           running_queue_offline_->push(*it, last_step_prefill_);
-          DLOG << "Pushed an offline request into running_queue_offline_";
+          // DVLOG << "Pushed an offline request into running_queue_offline_";
         } else {
           running_queue_->push(*it, last_step_prefill_);
-          DLOG << "Pushed an online request into running_queue_";
+          // DVLOG << "Pushed an online request into running_queue_";
         }
       }
     }
   } else {
-    DLOG << "Using unknown priority_strategy: " << options_.priority_strategy();
-    // directly push running requests to the priority queue
+    // DVLOG << "Using unknown priority_strategy: "
+    //       << options_.priority_strategy();
+    // directly push running requests to the priority queue
     for (auto it = running_requests_.begin(); it != running_requests_.end();
          ++it) {
       if (*it == nullptr) {
@@ -715,10 +718,10 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
       handle_running_requests(*it);
       if ((*it)->offline()) {
         running_queue_offline_->push(*it);
-        DLOG << "Pushed an offline request into running_queue_offline_";
+        // DVLOG << "Pushed an offline request into running_queue_offline_";
       } else {
         running_queue_->push(*it);
-        DLOG << "Pushed an online request into running_queue_";
+        // DVLOG << "Pushed an online request into running_queue_";
       }
     }
   }
@@ -827,7 +830,7 @@ std::vector<Batch> ContinuousScheduler::prepare_batch() {
   GAUGE_SET(num_free_blocks, util::max(block_manager_pool_->num_free_blocks()));
   GAUGE_SET(num_used_blocks, util::min(block_manager_pool_->num_used_blocks()));
   if (!batches[0].empty()) {
-    DLOG << "Built a batch";
+    DVLOG << "Built a batch";
   }
   return batches;
 }
diff --git a/xllm/core/scheduler/pd_ooc_scheduler.cpp b/xllm/core/scheduler/pd_ooc_scheduler.cpp
diff --git a/xllm/core/scheduler/pd_ooc_scheduler.h b/xllm/core/scheduler/pd_ooc_scheduler.h
diff --git a/xllm/proto/disagg_pd.proto b/xllm/proto/disagg_pd.proto
diff --git a/xllm/proto/xservice.proto b/xllm/proto/xservice.proto