feat: compress CUDA prebuilt binaries (#236)

giladgd · web-flow · commit b89ad2d40622 · 2024-06-16T02:06:06.000+03:00
* feat: compress CUDA prebuilt binaries
* feat: automatically solve more CUDA compilation errors
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -126,6 +126,7 @@ jobs:
 
       - name: Install Vulkan SDK on Windows
         if: startsWith(matrix.config.os, 'windows')
+        shell: powershell
         env:
           VULKAN_VERSION: 1.3.261.1
         run: |
@@ -233,6 +234,54 @@ jobs:
           
           EOF
 
+      - name: Cache UPX
+        id: cache-upx
+        uses: actions/cache@v4
+        with:
+          path: "upxInstallations/**"  # downloaded UPX release archives, consumed by the "Compress CUDA binary" steps
+          key: cache-upx-${{ runner.os }}-${{ github.workflow }}  # NOTE(review): key does not include the UPX version; after a version bump the old entry would presumably still be hit - confirm this is acceptable
+
+      - name: Compress CUDA binary on Windows
+        if: startsWith(matrix.config.os, 'windows')
+        shell: bash
+        env:
+          UPX_VERSION: 4.2.4
+        run: |
+          mkdir -p upxInstallations
+          # Download the UPX archive only if it is not already present; -f makes curl fail on HTTP errors instead of saving the error page as the archive.
+          if [ ! -f "./upxInstallations/upx-${UPX_VERSION}-win64.zip" ]; then
+            pushd upxInstallations
+            curl -fOL "https://github.com/upx/upx/releases/download/v${UPX_VERSION}/upx-${UPX_VERSION}-win64.zip"
+            popd
+          fi
+          # Unpack into ./upx and rename the versioned folder to the fixed path ./upx/upx.
+          mkdir -p upx
+          unzip -d ./upx "./upxInstallations/upx-${UPX_VERSION}-win64.zip"
+          mv "./upx/upx-${UPX_VERSION}-win64" ./upx/upx
+          # Compress the prebuilt Windows CUDA addon in place using UPX's --best setting.
+          ./upx/upx/upx.exe --best ./bins/win-x64-cuda/llama-addon.node
+
+      - name: Compress CUDA binary on Ubuntu
+        if: startsWith(matrix.config.name, 'Ubuntu GCC')
+        env:
+          UPX_VERSION: 4.2.4
+        run: |
+          mkdir -p upxInstallations
+          # Download the UPX archive only if it is not already present; -f makes curl fail on HTTP errors instead of saving the error page as the archive.
+          if [ ! -f "./upxInstallations/upx-${UPX_VERSION}-amd64_linux.tar.xz" ]; then
+            pushd upxInstallations
+            curl -fOL "https://github.com/upx/upx/releases/download/v${UPX_VERSION}/upx-${UPX_VERSION}-amd64_linux.tar.xz"
+            popd
+          fi
+          # Unpack into ./upx and rename the versioned folder to the fixed path ./upx/upx.
+          mkdir -p upx
+          tar -xvf "./upxInstallations/upx-${UPX_VERSION}-amd64_linux.tar.xz" -C ./upx
+          mv "./upx/upx-${UPX_VERSION}-amd64_linux" ./upx/upx
+          # Set the executable bit only for the UPX run and clear it afterwards (presumably UPX needs it to process the file - TODO confirm).
+          chmod +x ./bins/linux-x64-cuda/llama-addon.node
+          ./upx/upx/upx --best ./bins/linux-x64-cuda/llama-addon.node
+          chmod -x ./bins/linux-x64-cuda/llama-addon.node
+
       - name: Publish artifact
         uses: actions/upload-artifact@v4
         with:
diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts
@@ -192,9 +192,17 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
                 chalk.yellow('To install Xcode command line tools, run "xcode-select --install"')
             );
         else if (buildOptions.gpu === "cuda") {
-            if (!ignoreWorkarounds.includes("cudaArchitecture") && (platform === "win" || platform === "linux") && err instanceof SpawnError &&
-                err.combinedStd.toLowerCase().includes("Failed to detect a default CUDA architecture".toLowerCase())
-            ) {
+            if (!ignoreWorkarounds.includes("cudaArchitecture") && (platform === "win" || platform === "linux") &&
+                err instanceof SpawnError && (
+                err.combinedStd.toLowerCase().includes("Failed to detect a default CUDA architecture".toLowerCase()) || (
+                    err.combinedStd.toLowerCase().includes(
+                        "Tell CMake where to find the compiler by setting either the environment".toLowerCase()
+                    ) &&
+                    err.combinedStd.toLowerCase().includes(
+                        'variable "CUDACXX" or the CMake cache entry CMAKE_CUDA_COMPILER to the full'.toLowerCase()
+                    )
+                )
+            )) {
                 for (const nvccPath of await getCudaNvccPaths()) {
                     if (buildOptions.progressLogs)
                         console.info(
diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts
@@ -102,7 +102,7 @@ describe("stableCode", () => {
                     expect(res.contextSize).to.toMatchInlineSnapshot("8687");
                 }
                 try {
-                    resolveGpuLayers(16, {
+                    await resolveGpuLayers(16, {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0
                     });
@@ -111,7 +111,7 @@ describe("stableCode", () => {
                     expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]");
                 }
                 try {
-                    resolveGpuLayers(16, {
+                    await resolveGpuLayers(16, {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0.2
                     });
@@ -165,7 +165,7 @@ describe("stableCode", () => {
                     expect(res.contextSize).to.toMatchInlineSnapshot("10905");
                 }
                 try {
-                    resolveGpuLayers(32, {
+                    await resolveGpuLayers(32, {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0.2
                     });
@@ -214,7 +214,7 @@ describe("stableCode", () => {
                     expect(res.contextSize).to.toMatchInlineSnapshot("10905");
                 }
                 try {
-                    resolveGpuLayers(33, {
+                    await resolveGpuLayers(33, {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0.2
                     });
@@ -255,7 +255,7 @@ describe("stableCode", () => {
 
             it('attempts to resolve "max"', async () => {
                 try {
-                    resolveGpuLayers("max", {
+                    await resolveGpuLayers("max", {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0
                     });
@@ -265,7 +265,7 @@ describe("stableCode", () => {
                 }
 
                 try {
-                    resolveGpuLayers("max", {
+                    await resolveGpuLayers("max", {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0.2
                     });
@@ -275,7 +275,7 @@ describe("stableCode", () => {
                 }
 
                 try {
-                    resolveGpuLayers("max", {
+                    await resolveGpuLayers("max", {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 3.2
                     });
@@ -467,7 +467,7 @@ describe("stableCode", () => {
                     expect(res.contextSize).to.toMatchInlineSnapshot("16384");
                 }
                 try {
-                    resolveGpuLayers({min: 2}, {
+                    await resolveGpuLayers({min: 2}, {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0
                     });
@@ -476,7 +476,7 @@ describe("stableCode", () => {
                     expect(err).toMatchInlineSnapshot("[Error: Not enough VRAM to fit the model with the specified settings]");
                 }
                 try {
-                    resolveGpuLayers({min: 2, max: 4}, {
+                    await resolveGpuLayers({min: 2, max: 4}, {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 0
                     });
@@ -494,7 +494,7 @@ describe("stableCode", () => {
                     expect(res.contextSize).to.toMatchInlineSnapshot("13167");
                 }
                 try {
-                    resolveGpuLayers({min: 16}, {
+                    await resolveGpuLayers({min: 16}, {
                         totalVram: s1GB * 6,
                         freeVram: s1GB * 2
                     });
@@ -597,7 +597,7 @@ describe("stableCode", () => {
                 }
                 {
                     try {
-                        resolveGpuLayers({min: 1, fitContext: {contextSize: 8192}}, {
+                        await resolveGpuLayers({min: 1, fitContext: {contextSize: 8192}}, {
                             totalVram: s1GB * 0.2,
                             freeVram: s1GB * 0
                         });