Commit ea0d815

fix(Llama): expose the numa (#485)

* fix: expose the numa setting on the `Llama` instance
* fix: add `--numa` flag to cli commands

1 parent 5565614 · commit ea0d815
File tree: 7 files changed, +178 -42 lines

docs/guide/embedding.md (2 additions, 2 deletions)

@@ -172,7 +172,7 @@ const documents = [
     "Cleaning the house is a good way to keep it tidy"
 ];

-const query = "Tell me a nature geographical fact";
+const query = "Tell me a geographical fact";
 const rankedDocuments = await context.rankAndSort(query, documents);

 const topDocument = rankedDocuments[0]!;
@@ -185,7 +185,7 @@ console.log("Ranked documents:", rankedDocuments);
 ```
 > This example will produce this output:
 > ```
-> query: Tell me a nature geographical fact
+> query: Tell me a geographical fact
 > Top document: Mount Everest is the tallest mountain in the world
 > Second document: The capital of France is Paris
 > ```

src/bindings/Llama.ts (10 additions, 1 deletion)

@@ -42,6 +42,7 @@ export class Llama {
     /** @internal */ public readonly _debug: boolean;
     /** @internal */ public readonly _threadsSplitter: ThreadsSplitter;
     /** @internal */ private readonly _gpu: LlamaGpuType;
+    /** @internal */ private readonly _numa: LlamaNuma;
     /** @internal */ private readonly _buildType: "localBuild" | "prebuilt";
     /** @internal */ private readonly _cmakeOptions: Readonly<Record<string, string>>;
     /** @internal */ private readonly _supportsGpuOffloading: boolean;
@@ -95,6 +96,7 @@ export class Llama {

         this._bindings = bindings;
         this._debug = debug;
+        this._numa = numa ?? false;
         this._logLevel = this._debug
             ? LlamaLogLevel.debug
             : (logLevel ?? LlamaLogLevel.debug);
@@ -111,7 +113,7 @@

         bindings.ensureGpuDeviceIsSupported();

-        if (numa != null && numa !== false)
+        if (this._numa !== false)
             bindings.setNuma(numa);

         this._gpu = bindings.getGpuType() ?? false;
@@ -211,6 +213,13 @@
         this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
     }

+    /**
+     * See the `numa` option of `getLlama` for more information
+     */
+    public get numa() {
+        return this._numa;
+    }
+
     public get logLevel() {
         return this._logLevel;
     }
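With the getter above, the NUMA policy can be requested when loading the bindings and then read back from the `Llama` instance. A minimal sketch, assuming the package's public `getLlama` export:

```typescript
import {getLlama} from "node-llama-cpp";

// Ask for a NUMA allocation policy when the bindings are initialized.
// Valid values: "distribute" | "isolate" | "numactl" | "mirror" | false
const llama = await getLlama({
    numa: "distribute"
});

// The resolved policy is exposed on the instance (defaults to `false` when not set).
console.log("NUMA policy:", llama.numa);
```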

src/bindings/types.ts (15 additions, 0 deletions)

@@ -22,6 +22,7 @@ export type BuildOptions = {
         release: string
     }
 };
+export const llamaNumaOptions = ["distribute", "isolate", "numactl", "mirror", false] as const satisfies LlamaNuma[];
 export type LlamaNuma = false | "distribute" | "isolate" | "numactl" | "mirror";

 export type BuildOptionsJSON = Omit<BuildOptions, "customCmakeOptions"> & {
@@ -44,6 +45,20 @@ export function parseNodeLlamaCppGpuOption(option: (typeof nodeLlamaCppGpuOption
     return "auto";
 }

+export function parseNumaOption(option: (typeof llamaNumaOptions)[number] | (typeof nodeLlamaCppGpuOffStringOptions)[number]): LlamaNuma {
+    function optionIsGpuOff(opt: typeof option): opt is (typeof nodeLlamaCppGpuOffStringOptions)[number] {
+        return nodeLlamaCppGpuOffStringOptions.includes(opt as (typeof nodeLlamaCppGpuOffStringOptions)[number]);
+    }
+
+    if (optionIsGpuOff(option))
+        return false;
+
+    if (llamaNumaOptions.includes(option))
+        return option;
+
+    return false;
+}
+

 export function convertBuildOptionsJSONToBuildOptions(buildOptionsJSON: BuildOptionsJSON): BuildOptions {
     return {
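The new helper normalizes a raw CLI string into a `LlamaNuma` value: GPU "off"-style strings map to `false`, known policies pass through, and anything else falls back to `false`. A small usage sketch (these are internal exports of `src/bindings/types.ts` in the diff, so the relative import path is an assumption and they may not be part of the public API):

```typescript
import {llamaNumaOptions, parseNumaOption} from "./bindings/types.js";

// All accepted NUMA policies, plus `false` for "disabled":
console.log(llamaNumaOptions); // ["distribute", "isolate", "numactl", "mirror", false]

// Known policies pass through unchanged:
console.log(parseNumaOption("mirror")); // "mirror"
```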

src/cli/commands/ChatCommand.ts (24 additions, 6 deletions)

@@ -13,7 +13,8 @@ import {getLlama} from "../../bindings/getLlama.js";
 import {LlamaGrammar} from "../../evaluator/LlamaGrammar.js";
 import {LlamaChatSession} from "../../evaluator/LlamaChatSession/LlamaChatSession.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import withOra from "../../utils/withOra.js";
 import {TokenMeter} from "../../evaluator/TokenMeter.js";
@@ -67,6 +68,7 @@ type ChatCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -298,6 +300,20 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             default: false,
             description: "Print llama.cpp info and debug logs"
         })
+        .option("numa", {
+            type: "string",
+
+            // yargs types don't support passing `false` as a choice, although it is supported by yargs
+            choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
+            coerce: (value) => {
+                if (value == null || value == "")
+                    return false;
+
+                return parseNumaOption(value);
+            },
+            defaultDescription: "false",
+            description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+        })
         .option("meter", {
             type: "boolean",
             default: false,
@@ -326,7 +342,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory,
-        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunChat({
@@ -335,7 +351,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 temperature, minP, topK, topP, seed,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-                debug, meter, timing, noMmap, printTimings
+                debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -352,7 +368,7 @@ async function RunChat({
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
-    tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+    tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -369,11 +385,13 @@ async function RunChat({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
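The CLI wiring is the same in all three commands: a string option whose `coerce` maps an empty or missing value to `false` and otherwise delegates to `parseNumaOption`, with the coerced value forwarded to `getLlama`. A standalone sketch of that yargs pattern (a simplified stand-in, not the project's actual builder; `numaChoices` and the inline normalization are illustrative):

```typescript
import yargs from "yargs";
import {hideBin} from "yargs/helpers";

type LlamaNuma = false | "distribute" | "isolate" | "numactl" | "mirror";
const numaChoices = ["distribute", "isolate", "numactl", "mirror"] as const;

const argv = await yargs(hideBin(process.argv))
    .option("numa", {
        type: "string",
        choices: numaChoices,
        // Runs only when the flag is present; an empty value means "disabled"
        coerce: (value: string | undefined): LlamaNuma => {
            if (value == null || value === "")
                return false;

            return (numaChoices as readonly string[]).includes(value)
                ? (value as LlamaNuma)
                : false;
        },
        defaultDescription: "false",
        description: "NUMA allocation policy"
    })
    .parseAsync();

// `undefined` when the flag is omitted entirely; downstream code can treat that as `false`
console.log("numa:", argv.numa);
```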

src/cli/commands/CompleteCommand.ts (24 additions, 6 deletions)

@@ -7,7 +7,8 @@ import fs from "fs-extra";
 import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
 import withOra from "../../utils/withOra.js";
@@ -49,6 +50,7 @@ type CompleteCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -218,6 +220,20 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
             default: false,
             description: "Print llama.cpp info and debug logs"
         })
+        .option("numa", {
+            type: "string",
+
+            // yargs types don't support passing `false` as a choice, although it is supported by yargs
+            choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
+            coerce: (value) => {
+                if (value == null || value == "")
+                    return false;
+
+                return parseNumaOption(value);
+            },
+            defaultDescription: "false",
+            description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+        })
         .option("meter", {
             type: "boolean",
             default: false,
@@ -245,14 +261,14 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
         flashAttention, swaFullCache, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -267,7 +283,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -282,11 +298,13 @@ async function RunCompletion({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
        })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;

src/cli/commands/InfillCommand.ts (24 additions, 6 deletions)

@@ -7,7 +7,8 @@ import fs from "fs-extra";
 import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
 import withOra from "../../utils/withOra.js";
@@ -51,6 +52,7 @@ type InfillCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -228,6 +230,20 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
             default: false,
             description: "Print llama.cpp info and debug logs"
         })
+        .option("numa", {
+            type: "string",
+
+            // yargs types don't support passing `false` as a choice, although it is supported by yargs
+            choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
+            coerce: (value) => {
+                if (value == null || value == "")
+                    return false;
+
+                return parseNumaOption(value);
+            },
+            defaultDescription: "false",
+            description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+        })
         .option("meter", {
             type: "boolean",
             default: false,
@@ -255,14 +271,14 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
         flashAttention, swaFullCache, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunInfill({
                 modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
                 swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -277,7 +293,7 @@ async function RunInfill({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
     swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
 }: InfillCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -292,11 +308,13 @@ async function RunInfill({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
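Taken together, the `chat`, `complete`, and `infill` commands shown here now accept a `--numa <policy>` flag (one of `distribute`, `isolate`, `numactl`, or `mirror`); when the flag is omitted or given an empty value it resolves to `false`, and the chosen policy is forwarded to `getLlama` and exposed on the resulting `Llama` instance via `llama.numa`.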
