
Commit 1e7c5d0

fix: bugs (#241)
* fix: avoid duplicate context shifts
* fix: `onProgress` on `ModelDownloader`
* fix: re-enable CUDA binary compression
* fix: more thorough tests before loading a binary
* fix: increase compatibility of prebuilt binaries
1 parent 0d40ffc commit 1e7c5d0

File tree

8 files changed: +142 −99 lines changed


.github/workflows/build.yml

Lines changed: 71 additions & 77 deletions
```diff
@@ -53,24 +53,17 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - name: "Windows MSVC"
+          - name: "Windows for x64"
+            os: windows-2019
+            artifact: "win-x64"
+          - name: "Windows for Arm"
             os: windows-2022
-            cc: "cl"
-            cxx: "cl"
-            environment_script: "C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise/VC/Auxiliary/Build/vcvars64.bat"
-            generators: "Visual Studio 17 2022"
-            artifact: "win"
-          - name: "Ubuntu GCC"
-            os: ubuntu-22.04
-            cc: "gcc"
-            cxx: "g++"
-            generators: "Ninja"
+            artifact: "win-arm"
+          - name: "Ubuntu"
+            os: ubuntu-20.04
             artifact: "linux"
-          - name: "macOS Clang"
+          - name: "macOS"
             os: macos-13
-            cc: "clang"
-            cxx: "clang++"
-            generators: "Xcode"
             artifact: "mac"

     steps:
@@ -97,7 +90,7 @@ jobs:
           choco install ninja cmake

       - name: Install dependencies on Ubuntu
-        if: startsWith(matrix.config.name, 'Ubuntu GCC')
+        if: matrix.config.name == 'Ubuntu'
         run: |
           sudo apt-get update
           sudo apt-get install ninja-build cmake libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf
@@ -108,24 +101,24 @@ jobs:
           which arm-linux-gnueabihf-gcc
           which arm-linux-gnueabihf-g++

-      - name: Install Cuda on Windows
-        if: startsWith(matrix.config.os, 'windows')
+      - name: Install Cuda on Windows for x64
+        if: matrix.config.name == 'Windows for x64'
         uses: Jimver/cuda-toolkit@v0.2.15
         with:
-          cuda: '12.4.1'
+          cuda: '12.2.0'
           method: 'network'
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
           use-local-cache: false

       - name: Install Cuda on Ubuntu
-        if: startsWith(matrix.config.name, 'Ubuntu GCC')
+        if: matrix.config.name == 'Ubuntu'
         uses: Jimver/cuda-toolkit@v0.2.15
         with:
-          cuda: '12.4.1'
+          cuda: '12.2.0'
           method: 'network'

-      - name: Install Vulkan SDK on Windows
-        if: startsWith(matrix.config.os, 'windows')
+      - name: Install Vulkan SDK on Windows for x64
+        if: matrix.config.name == 'Windows for x64'
         shell: powershell
         env:
           VULKAN_VERSION: 1.3.261.1
@@ -136,15 +129,15 @@ jobs:
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

       - name: Install Vulkan SDK on Ubuntu
-        if: startsWith(matrix.config.name, 'Ubuntu GCC')
+        if: matrix.config.name == 'Ubuntu'
         run: |
-          wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-focal.list https://packages.lunarg.com/vulkan/lunarg-vulkan-focal.list
           sudo apt update
           sudo apt install vulkan-sdk

       - name: Install dependencies on macOS
-        if: startsWith(matrix.config.os, 'macos')
+        if: matrix.config.name == 'macOS'
         run: |
           brew install cmake ninja
           alias make=cmake
@@ -203,10 +196,11 @@ jobs:
             }

             // build binaries
-            if (process.env.ARTIFACT_NAME === "win") {
+            if (process.env.ARTIFACT_NAME === "win-x64") {
                 await buildBinary("x64", ["--gpu", "false"]);
                 await buildBinary("x64", ["--gpu", "cuda"]);
                 await buildBinary("x64", ["--gpu", "vulkan"]);
+            } else if (process.env.ARTIFACT_NAME === "win-arm") {
                 await buildBinary("arm64", ["--gpu", "false"], windowsOnArmNodeVersion);
             } else if (process.env.ARTIFACT_NAME === "linux") {
                 await buildBinary("x64", ["--gpu", "false"]);
@@ -234,53 +228,53 @@ jobs:

           EOF

-      # - name: Cache UPX
-      #   id: cache-upx
-      #   uses: actions/cache@v4
-      #   with:
-      #     path: "upxInstallations/**"
-      #     key: cache-upx-${{ runner.os }}-${{ github.workflow }}
-      #
-      # - name: Compress CUDA binary on Windows
-      #   if: startsWith(matrix.config.os, 'windows')
-      #   shell: bash
-      #   env:
-      #     UPX_VERSION: 4.2.4
-      #   run: |
-      #     mkdir -p upxInstallations
-      #
-      #     if [ ! -f "./upxInstallations/upx-${UPX_VERSION}-win64.zip" ]; then
-      #       pushd upxInstallations
-      #       curl -OL "https://github.com/upx/upx/releases/download/v${UPX_VERSION}/upx-${UPX_VERSION}-win64.zip"
-      #       popd
-      #     fi
-      #
-      #     mkdir -p upx
-      #     unzip -d ./upx "./upxInstallations/upx-${UPX_VERSION}-win64.zip"
-      #     mv "./upx/upx-${UPX_VERSION}-win64" ./upx/upx
-      #
-      #     ./upx/upx/upx.exe --best ./bins/win-x64-cuda/llama-addon.node
-      #
-      # - name: Compress CUDA binary on Ubuntu
-      #   if: startsWith(matrix.config.name, 'Ubuntu GCC')
-      #   env:
-      #     UPX_VERSION: 4.2.4
-      #   run: |
-      #     mkdir -p upxInstallations
-      #
-      #     if [ ! -f "./upxInstallations/upx-${UPX_VERSION}-amd64_linux.tar.xz" ]; then
-      #       pushd upxInstallations
-      #       curl -OL "https://github.com/upx/upx/releases/download/v${UPX_VERSION}/upx-${UPX_VERSION}-amd64_linux.tar.xz"
-      #       popd
-      #     fi
-      #
-      #     mkdir -p upx
-      #     tar -xvf "./upxInstallations/upx-${UPX_VERSION}-amd64_linux.tar.xz" -C ./upx
-      #     mv "./upx/upx-${UPX_VERSION}-amd64_linux" ./upx/upx
-      #
-      #     chmod +x ./bins/linux-x64-cuda/llama-addon.node
-      #     ./upx/upx/upx --best ./bins/linux-x64-cuda/llama-addon.node
-      #     chmod -x ./bins/linux-x64-cuda/llama-addon.node
+      - name: Cache UPX
+        id: cache-upx
+        uses: actions/cache@v4
+        with:
+          path: "upxInstallations/**"
+          key: cache-upx-${{ runner.os }}-${{ github.workflow }}
+
+      - name: Compress CUDA binary on Windows
+        if: matrix.config.name == 'Windows for x64'
+        shell: bash
+        env:
+          UPX_VERSION: 4.2.4
+        run: |
+          mkdir -p upxInstallations
+
+          if [ ! -f "./upxInstallations/upx-${UPX_VERSION}-win64.zip" ]; then
+            pushd upxInstallations
+            curl -OL "https://github.com/upx/upx/releases/download/v${UPX_VERSION}/upx-${UPX_VERSION}-win64.zip"
+            popd
+          fi
+
+          mkdir -p upx
+          unzip -d ./upx "./upxInstallations/upx-${UPX_VERSION}-win64.zip"
+          mv "./upx/upx-${UPX_VERSION}-win64" ./upx/upx
+
+          ./upx/upx/upx.exe --best ./bins/win-x64-cuda/llama-addon.node
+
+      - name: Compress CUDA binary on Ubuntu
+        if: matrix.config.name == 'Ubuntu'
+        env:
+          UPX_VERSION: 4.2.4
+        run: |
+          mkdir -p upxInstallations
+
+          if [ ! -f "./upxInstallations/upx-${UPX_VERSION}-amd64_linux.tar.xz" ]; then
+            pushd upxInstallations
+            curl -OL "https://github.com/upx/upx/releases/download/v${UPX_VERSION}/upx-${UPX_VERSION}-amd64_linux.tar.xz"
+            popd
+          fi
+
+          mkdir -p upx
+          tar -xvf "./upxInstallations/upx-${UPX_VERSION}-amd64_linux.tar.xz" -C ./upx
+          mv "./upx/upx-${UPX_VERSION}-amd64_linux" ./upx/upx
+
+          chmod +x ./bins/linux-x64-cuda/llama-addon.node
+          ./upx/upx/upx --best ./bins/linux-x64-cuda/llama-addon.node
+          chmod -x ./bins/linux-x64-cuda/llama-addon.node

       - name: Publish artifact
         uses: actions/upload-artifact@v4
@@ -510,7 +504,7 @@ jobs:
           - name: "Windows"
             os: windows-2022
           - name: "Ubuntu"
-            os: ubuntu-22.04
+            os: ubuntu-20.04
           - name: "macOS"
             os: macos-13

@@ -521,7 +515,7 @@ jobs:
           node-version: "20"

       - name: Install dependencies on Ubuntu
-        if: startsWith(matrix.config.name, 'Ubuntu')
+        if: matrix.config.name == 'Ubuntu'
         run: |
           sudo apt-get update
           sudo apt-get install libarchive-tools rpm
```
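
The matrix now splits the old `win` artifact into `win-x64` and `win-arm`, and the Linux jobs move to ubuntu-20.04 so the produced binaries link against an older glibc and run on more distributions (the "increase compatibility of prebuilt binaries" fix). The UPX compression steps are also re-enabled, keyed to the new matrix names. As a hypothetical illustration (not code from this commit), a loader could map the running process onto the resulting artifact names like so:

```ts
// Hypothetical helper, not part of this commit: maps the current process
// onto the artifact names defined in the build matrix above.
function getPrebuiltArtifactName(
    platform: NodeJS.Platform = process.platform,
    arch: string = process.arch
): string {
    if (platform === "win32")
        return arch === "arm64" ? "win-arm" : "win-x64";
    else if (platform === "darwin")
        return "mac";

    return "linux";
}
```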

src/bindings/getLlama.ts

Lines changed: 19 additions & 5 deletions
```diff
@@ -190,11 +190,14 @@ const defaultBuildOption: Exclude<LlamaOptions["build"], undefined> = runningInE

 /**
  * Get a `llama.cpp` binding.
- * Defaults to prefer a prebuilt binary, and fallbacks to building from source if a prebuilt binary is not found.
- * Pass `"lastCliBuild"` to default to use the last successful build created using the `download` or `build` CLI commands if one exists.
+ *
+ * Defaults to use a local binary built using the `download` or `build` CLI commands if one exists,
+ * otherwise, uses a prebuilt binary, and fallbacks to building from source if a prebuilt binary is not found.
+ *
+ * Pass `"lastBuild"` to default to use the last successful build created using the `download` or `build` CLI commands if one exists.
  */
-export async function getLlama(type: "lastBuild", lastBuildOptions?: LastBuildOptions): Promise<Llama>;
 export async function getLlama(options?: LlamaOptions): Promise<Llama>;
+export async function getLlama(type: "lastBuild", lastBuildOptions?: LastBuildOptions): Promise<Llama>;
 export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOptions?: LastBuildOptions) {
     if (options === "lastBuild") {
         const lastBuildInfo = await getLastBuildInfo();
```
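
The corrected doc comment (the old text referenced `"lastCliBuild"`, a literal the overloads never accepted) describes both call forms, and the overloads were reordered so the options-based form is listed first, which is the signature editors surface by default. A minimal usage sketch, assuming the package's public entry point is `node-llama-cpp`:

```ts
import {getLlama} from "node-llama-cpp";

// default resolution order: local CLI build -> prebuilt binary -> build from source
const llama = await getLlama();

// explicitly use the last successful `download`/`build` CLI build
const llamaFromLastBuild = await getLlama("lastBuild");
```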
```diff
@@ -515,14 +518,22 @@ async function loadExistingLlamaBinary({
                     skipLlamaInit,
                     debug
                 });
-            } else if (progressLogs)
+            } else if (progressLogs) {
+                const binaryDescription = describeBinary({
+                    ...buildOptions,
+                    customCmakeOptions: existingPrebuiltBinaryMustMatchBuildOptions
+                        ? buildOptions.customCmakeOptions
+                        : new Map()
+                });
                 console.warn(
-                    getConsoleLogPrefix() + "The prebuilt binary is not compatible with the current system" + (
+                    getConsoleLogPrefix() +
+                    `The prebuilt ${binaryDescription} is not compatible with the current system` + (
                         fallbackMessage != null
                             ? ", " + fallbackMessage
                             : ""
                     )
                 );
+            }
         } catch (err) {
             const binaryDescription = describeBinary({
                 ...buildOptions,
```
```diff
@@ -687,6 +698,9 @@ function getShouldTestBinaryBeforeLoading({
             platformInfo.version !== buildMetadata.buildOptions.platformInfo.version
         )
             return true;
+    } else if (platform === "win") {
+        if (buildMetadata.buildOptions.gpu !== false)
+            return true;
     }

     return false;
```
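
This new branch means any GPU-enabled build on Windows is test-loaded before use, in line with the "more thorough tests before loading a binary" fix. A minimal sketch of the resulting decision, with assumed shapes for the arguments (the real function takes a destructured options object):

```ts
// Minimal sketch (assumed shapes) of the decision in getShouldTestBinaryBeforeLoading:
type BuildGpu = false | "cuda" | "vulkan" | "metal";

function shouldTestBeforeLoading(
    platform: string,
    currentPlatformVersion: string,
    binary: {platformVersion: string, gpu: BuildGpu}
): boolean {
    // branch visible in the diff context: re-test when the platform version
    // the binary was built against differs from the current one
    if (currentPlatformVersion !== binary.platformVersion)
        return true;

    // new in this commit: on Windows, always test GPU-enabled binaries
    // before loading them into the main process
    if (platform === "win" && binary.gpu !== false)
        return true;

    return false;
}
```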

src/bindings/utils/testBindingBinary.ts

Lines changed: 2 additions & 0 deletions
```diff
@@ -101,6 +101,8 @@ if (process.env.TEST_BINDING_CP === "true" && process.send != null) {
         try {
             const binding: BindingModule = require(message.bindingBinaryPath);
             await binding.init();
+            binding.getGpuVramInfo();
+            binding.getGpuDeviceInfo();
             process.send({type: "done"} satisfies ChildToParentMessage);
         } catch (err) {
             console.error(err);
```
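
With these two calls, a binary that initializes but crashes on its first GPU query now fails the pre-load test instead of crashing the host process. For context, a sketch of the parent side of this handshake, inferred from the `TEST_BINDING_CP` guard and the `{type: "done"}` reply visible above; the function name and exact message protocol here are assumptions:

```ts
import {fork} from "node:child_process";

// Assumed parent-side flow: fork this module with TEST_BINDING_CP=true,
// send it the binary path over IPC, and treat anything other than a
// {type: "done"} reply (e.g. a crash or nonzero exit) as an incompatible binary.
async function testBindingBinaryInChildProcess(
    testerModulePath: string,
    bindingBinaryPath: string
): Promise<boolean> {
    return await new Promise<boolean>((resolve) => {
        const child = fork(testerModulePath, [], {
            env: {...process.env, TEST_BINDING_CP: "true"}
        });

        child.on("message", (message: any) => {
            if (message?.type === "done") {
                resolve(true);
                child.kill();
            }
        });
        child.on("exit", (code) => resolve(code === 0));

        child.send({bindingBinaryPath});
    });
}
```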

src/cli/commands/inspect/commands/InspectGpuCommand.ts

Lines changed: 39 additions & 7 deletions
```diff
@@ -10,6 +10,7 @@ import {getPrettyBuildGpuName} from "../../../../bindings/consts.js";
 import {getModuleVersion} from "../../../../utils/getModuleVersion.js";
 import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js";
 import {documentationPageUrls} from "../../../../config.js";
+import {Llama} from "../../../../bindings/Llama.js";

 type InspectGpuCommand = {
     // no options for now
@@ -26,6 +27,14 @@ export const InspectGpuCommand: CommandModule<object, InspectGpuCommand> = {
         const arch = process.arch;
         const availableComputeLayers = await detectAvailableComputeLayers({platform});
         const gpusToLogVramUsageOf: BuildGpu[] = [];
+        const gpuToLlama = new Map<BuildGpu, Llama | undefined>();
+
+        async function loadLlamaForGpu(gpu: BuildGpu) {
+            if (!gpuToLlama.has(gpu))
+                gpuToLlama.set(gpu, await getLlamaForGpu(gpu));
+
+            return gpuToLlama.get(gpu);
+        }

         console.info(`${chalk.yellow("OS:")} ${os.type()} ${os.release()} ${chalk.dim("(" + os.arch() + ")")}`);

@@ -62,28 +71,44 @@ export const InspectGpuCommand: CommandModule<object, InspectGpuCommand> = {
         } else if (availableComputeLayers.cuda.hasCudaRuntime && !availableComputeLayers.cuda.hasNvidiaDriver) {
             console.info(`${chalk.yellow("CUDA:")} ${chalk.red("CUDA runtime is installed, but NVIDIA driver is not")}`);
         } else if (availableComputeLayers.cuda.hasCudaRuntime && availableComputeLayers.cuda.hasNvidiaDriver) {
-            console.info(`${chalk.yellow("CUDA:")} ${chalk.green("available")}`);
-            gpusToLogVramUsageOf.push("cuda");
+            const llama = await loadLlamaForGpu("cuda");
+
+            if (llama == null)
+                console.info(`${chalk.yellow("CUDA:")} ${chalk.red("CUDA is detected, but using it failed")}`);
+            else {
+                console.info(`${chalk.yellow("CUDA:")} ${chalk.green("available")}`);
+                gpusToLogVramUsageOf.push("cuda");
+            }
         }

         if (availableComputeLayers.vulkan) {
-            console.info(`${chalk.yellow("Vulkan:")} ${chalk.green("available")}`);
-            gpusToLogVramUsageOf.push("vulkan");
+            const llama = await loadLlamaForGpu("vulkan");
+
+            if (llama == null)
+                console.info(`${chalk.yellow("Vulkan:")} ${chalk.red("Vulkan is detected, but using it failed")}`);
+            else {
+                console.info(`${chalk.yellow("Vulkan:")} ${chalk.green("available")}`);
+                gpusToLogVramUsageOf.push("vulkan");
+            }
         }

         for (const gpu of gpusToLogVramUsageOf) {
+            const llama = gpuToLlama.get(gpu);
+            if (llama == null)
+                continue;
+
             console.info();
-            await logGpuVramUsage(gpu);
+            await logGpuVramUsage(gpu, llama);
         }

         console.info();
         await logRamUsage();
     }
 };

-async function logGpuVramUsage(gpu: BuildGpu) {
+async function getLlamaForGpu(gpu: BuildGpu) {
     try {
-        const llama = await getLlamaForOptions({
+        return await getLlamaForOptions({
             gpu: gpu,
             build: "never",
             progressLogs: false,
@@ -92,6 +117,13 @@ async function logGpuVramUsage(gpu: BuildGpu) {
         }, {
             skipLlamaInit: true
         });
+    } catch (err) {
+        return undefined;
+    }
+}
+
+async function logGpuVramUsage(gpu: BuildGpu, llama: Llama) {
+    try {
         const gpuName = getPrettyBuildGpuName(gpu);
         const vramStatus = await llama.getVramState();
         const gpuDeviceNames = await llama.getGpuDeviceNames();
```
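
A note on the caching above: `loadLlamaForGpu` gates on `Map.has()` rather than on the stored value, so a backend that failed to load is cached as `undefined` and is never probed twice, and the result from the CUDA/Vulkan availability checks is reused by the VRAM-usage loop. The same pattern in isolation, as a minimal sketch:

```ts
// Minimal sketch of the memoization pattern used above: cache failures
// (undefined) as well as successes, so each backend is probed at most once.
const cache = new Map<string, object | undefined>();

async function probeOnce(
    key: string,
    probe: () => Promise<object | undefined>
): Promise<object | undefined> {
    if (!cache.has(key))
        cache.set(key, await probe());

    return cache.get(key);
}
```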
