Commit ea0d815

fix(Llama): expose the numa (#485)

* fix: expose the numa setting on the `Llama` instance
* fix: add `--numa` flag to cli commands

1 parent 5565614 · commit ea0d815
File tree: 7 files changed, +178 -42 lines

docs/guide/embedding.md (2 additions, 2 deletions)

@@ -172,7 +172,7 @@ const documents = [
     "Cleaning the house is a good way to keep it tidy"
 ];

-const query = "Tell me a nature geographical fact";
+const query = "Tell me a geographical fact";
 const rankedDocuments = await context.rankAndSort(query, documents);

 const topDocument = rankedDocuments[0]!;
@@ -185,7 +185,7 @@ console.log("Ranked documents:", rankedDocuments);
 ```
 > This example will produce this output:
 > ```
-> query: Tell me a nature geographical fact
+> query: Tell me a geographical fact
 > Top document: Mount Everest is the tallest mountain in the world
 > Second document: The capital of France is Paris
 > ```

src/bindings/Llama.ts (10 additions, 1 deletion)

@@ -42,6 +42,7 @@ export class Llama {
     /** @internal */ public readonly _debug: boolean;
     /** @internal */ public readonly _threadsSplitter: ThreadsSplitter;
     /** @internal */ private readonly _gpu: LlamaGpuType;
+    /** @internal */ private readonly _numa: LlamaNuma;
     /** @internal */ private readonly _buildType: "localBuild" | "prebuilt";
     /** @internal */ private readonly _cmakeOptions: Readonly<Record<string, string>>;
     /** @internal */ private readonly _supportsGpuOffloading: boolean;
@@ -95,6 +96,7 @@ export class Llama {

         this._bindings = bindings;
         this._debug = debug;
+        this._numa = numa ?? false;
         this._logLevel = this._debug
             ? LlamaLogLevel.debug
             : (logLevel ?? LlamaLogLevel.debug);
@@ -111,7 +113,7 @@

         bindings.ensureGpuDeviceIsSupported();

-        if (numa != null && numa !== false)
+        if (this._numa !== false)
             bindings.setNuma(numa);

         this._gpu = bindings.getGpuType() ?? false;
@@ -211,6 +213,13 @@
         this._threadsSplitter.maxThreads = Math.floor(Math.max(0, value));
     }

+    /**
+     * See the `numa` option of `getLlama` for more information
+     */
+    public get numa() {
+        return this._numa;
+    }
+
     public get logLevel() {
         return this._logLevel;
     }
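With the getter above, the NUMA policy can be requested when loading the bindings and then read back from the `Llama` instance. A minimal sketch, assuming the package's public `getLlama` export:

```typescript
import {getLlama} from "node-llama-cpp";

// Ask for a NUMA allocation policy when the bindings are initialized.
// Valid values: "distribute" | "isolate" | "numactl" | "mirror" | false
const llama = await getLlama({
    numa: "distribute"
});

// The resolved policy is exposed on the instance (defaults to `false` when not set).
console.log("NUMA policy:", llama.numa);
```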

src/bindings/types.ts (15 additions, 0 deletions)

@@ -22,6 +22,7 @@ export type BuildOptions = {
         release: string
     }
 };
+export const llamaNumaOptions = ["distribute", "isolate", "numactl", "mirror", false] as const satisfies LlamaNuma[];
 export type LlamaNuma = false | "distribute" | "isolate" | "numactl" | "mirror";

 export type BuildOptionsJSON = Omit<BuildOptions, "customCmakeOptions"> & {
@@ -44,6 +45,20 @@ export function parseNodeLlamaCppGpuOption(option: (typeof nodeLlamaCppGpuOption
     return "auto";
 }

+export function parseNumaOption(option: (typeof llamaNumaOptions)[number] | (typeof nodeLlamaCppGpuOffStringOptions)[number]): LlamaNuma {
+    function optionIsGpuOff(opt: typeof option): opt is (typeof nodeLlamaCppGpuOffStringOptions)[number] {
+        return nodeLlamaCppGpuOffStringOptions.includes(opt as (typeof nodeLlamaCppGpuOffStringOptions)[number]);
+    }
+
+    if (optionIsGpuOff(option))
+        return false;
+
+    if (llamaNumaOptions.includes(option))
+        return option;
+
+    return false;
+}
+

 export function convertBuildOptionsJSONToBuildOptions(buildOptionsJSON: BuildOptionsJSON): BuildOptions {
     return {
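The new helper normalizes a raw CLI string into a `LlamaNuma` value: GPU "off"-style strings map to `false`, known policies pass through, and anything else falls back to `false`. A small usage sketch (these are internal exports of `src/bindings/types.ts` in the diff, so the relative import path is an assumption and they may not be part of the public API):

```typescript
import {llamaNumaOptions, parseNumaOption} from "./bindings/types.js";

// All accepted NUMA policies, plus `false` for "disabled":
console.log(llamaNumaOptions); // ["distribute", "isolate", "numactl", "mirror", false]

// Known policies pass through unchanged:
console.log(parseNumaOption("mirror")); // "mirror"
```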

src/cli/commands/ChatCommand.ts (24 additions, 6 deletions)

@@ -13,7 +13,8 @@ import {getLlama} from "../../bindings/getLlama.js";
 import {LlamaGrammar} from "../../evaluator/LlamaGrammar.js";
 import {LlamaChatSession} from "../../evaluator/LlamaChatSession/LlamaChatSession.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import withOra from "../../utils/withOra.js";
 import {TokenMeter} from "../../evaluator/TokenMeter.js";
@@ -67,6 +68,7 @@ type ChatCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -298,6 +300,20 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
             default: false,
             description: "Print llama.cpp info and debug logs"
         })
+        .option("numa", {
+            type: "string",
+
+            // yargs types don't support passing `false` as a choice, although it is supported by yargs
+            choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
+            coerce: (value) => {
+                if (value == null || value == "")
+                    return false;
+
+                return parseNumaOption(value);
+            },
+            defaultDescription: "false",
+            description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+        })
         .option("meter", {
             type: "boolean",
             default: false,
@@ -326,7 +342,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory,
-        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunChat({
@@ -335,7 +351,7 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 temperature, minP, topK, topP, seed,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-                debug, meter, timing, noMmap, printTimings
+                debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -352,7 +368,7 @@ async function RunChat({
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
-    tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+    tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -369,11 +385,13 @@ async function RunChat({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
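The CLI wiring is the same in all three commands: a string option whose `coerce` maps an empty or missing value to `false` and otherwise delegates to `parseNumaOption`, with the coerced value forwarded to `getLlama`. A standalone sketch of that yargs pattern (a simplified stand-in, not the project's actual builder; `numaChoices` and the inline normalization are illustrative):

```typescript
import yargs from "yargs";
import {hideBin} from "yargs/helpers";

type LlamaNuma = false | "distribute" | "isolate" | "numactl" | "mirror";
const numaChoices = ["distribute", "isolate", "numactl", "mirror"] as const;

const argv = await yargs(hideBin(process.argv))
    .option("numa", {
        type: "string",
        choices: numaChoices,
        // Runs only when the flag is present; an empty value means "disabled"
        coerce: (value: string | undefined): LlamaNuma => {
            if (value == null || value === "")
                return false;

            return (numaChoices as readonly string[]).includes(value)
                ? (value as LlamaNuma)
                : false;
        },
        defaultDescription: "false",
        description: "NUMA allocation policy"
    })
    .parseAsync();

// `undefined` when the flag is omitted entirely; downstream code can treat that as `false`
console.log("numa:", argv.numa);
```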

src/cli/commands/CompleteCommand.ts (24 additions, 6 deletions)

@@ -7,7 +7,8 @@ import fs from "fs-extra";
 import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
 import withOra from "../../utils/withOra.js";
@@ -49,6 +50,7 @@ type CompleteCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -218,6 +220,20 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
             default: false,
             description: "Print llama.cpp info and debug logs"
         })
+        .option("numa", {
+            type: "string",
+
+            // yargs types don't support passing `false` as a choice, although it is supported by yargs
+            choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
+            coerce: (value) => {
+                if (value == null || value == "")
+                    return false;
+
+                return parseNumaOption(value);
+            },
+            defaultDescription: "false",
+            description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+        })
         .option("meter", {
             type: "boolean",
             default: false,
@@ -245,14 +261,14 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
         flashAttention, swaFullCache, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -267,7 +283,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -282,11 +298,13 @@ async function RunCompletion({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
        })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;

src/cli/commands/InfillCommand.ts (24 additions, 6 deletions)

@@ -7,7 +7,8 @@ import fs from "fs-extra";
 import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
 import withOra from "../../utils/withOra.js";
@@ -51,6 +52,7 @@ type InfillCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -228,6 +230,20 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
             default: false,
             description: "Print llama.cpp info and debug logs"
         })
+        .option("numa", {
+            type: "string",
+
+            // yargs types don't support passing `false` as a choice, although it is supported by yargs
+            choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
+            coerce: (value) => {
+                if (value == null || value == "")
+                    return false;
+
+                return parseNumaOption(value);
+            },
+            defaultDescription: "false",
+            description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+        })
         .option("meter", {
             type: "boolean",
             default: false,
@@ -255,14 +271,14 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
         flashAttention, swaFullCache, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunInfill({
                 modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
                 swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -277,7 +293,7 @@ async function RunInfill({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
     swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
 }: InfillCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -292,11 +308,13 @@ async function RunInfill({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         })
         : await getLlama({
             gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
         });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
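Taken together, the `chat`, `complete`, and `infill` commands shown here now accept a `--numa <policy>` flag (one of `distribute`, `isolate`, `numactl`, or `mirror`); when the flag is omitted or given an empty value it resolves to `false`, and the chosen policy is forwarded to `getLlama` and exposed on the resulting `Llama` instance via `llama.numa`.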
