AmpereComputingAI · MarcelWilnicki · Aug 22, 2024 · Aug 22, 2024 · Aug 22, 2024 · Aug 22, 2024
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -1,39 +1,32 @@
-# Running benchmark
+![Ampere AI](https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/ampere_logo_®_primary_stacked_rgb.png "Ampere AI")
+# Wrapper for multi-process / batched benchmark of llama.cpp
 
-This benchmarking tool runs multi-process, throughput-oriented benchmark of Ampere optimized llama.cpp using arbitrary model(s) provided by the user. 
-The benchmarking script spawns multiple parallel streams of token generation using llama.cpp and provides user with aggregate metrics of both prompt eval and token generation stages.
-Underneath, the _batched-bench_ script from upstream llama.cpp project is being used in an unaltered form.
-The script orchestrates the benchmark inside Docker container from the outside environment, **therefore this script should not be run inside Docker container.**
 
-## Setup
-Few dependencies need to be installed first. On Debian-based systems you can use the setup script.
+## ARM
+Instructions assume you have a Debian-based OS.
 ```bash
+cd benchmarks
 sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
+# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
+nohup sudo bash run.sh
 ```
 
-## Downloading models
-Any GGUF model is expected to work, if you experience troubles running your network of choice please raise an [issue](https://github.com/AmpereComputingAI/llama.cpp/issues/new/choose).
-Benchmarking script expects models to be placed under _**llama.cpp/benchmarks/models**_ dir.
+## x86
+Instructions assume you have a Debian-based OS.
 ```bash
-mkdir -p models
-huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir models --local-dir-use-symlinks False
-```
+cd benchmarks
+sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
 
-## Benchmark
-Provide run.py Python script with following arguments:
-- -m, filename(s) of model(s) that should be available under _**llama.cpp/benchmarks/models**_ directory, multiple models can be provided
-- -t, threadpool(s) per single process, e.g., if there are 20 threads available on the system, if -t 10 is provided, 2 parallel processes will be spawned, each using 10 threads;
-  multiple threadpools can be provided and they will be treated as separate cases to benchmark
-- -b, batch size(s) to benchmark, meaning separate token generation streams handled as a single batch; multiple batch sizes can be provided and they will be treated as separate cases to benchmark
-- -p, prompt size(s) to benchmark, size of an input prompt; multiple prompt sizes can be provided and they will be treated as separate cases to benchmark
-- -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless user wants to use just a subset of available threads, say 16-63 (48 threads indexed 16<>63)
-```bash
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+cd utils
+sudo docker build -t llama_x86 .
+cd ..
+# vim run.sh # modify run.sh (provide the name of the docker image, threads, batch sizes etc.)
+nohup sudo bash run.sh
 ```
 
-## Quick run on 80t OCI A1 system
-```bash
-bash setup_deb.sh  # works on Debian-based systems
-bash download_models.sh  # uncomment preferred models in the file, by default llama3 q8_0 will be downloaded
-bash run.sh  # modify to adjust number of threads available and other parameters
-```
+Benchmarks will take a few hours with the default settings, going over various combinations of n_proc x n_threads x batch_size x prompt_size x model_size 😵‍💫
+After they complete, you will find .csv files with results in the benchmarks directory of this repo.
diff --git a/benchmarks/run.py b/benchmarks/run.py
@@ -11,19 +11,18 @@ def get_file_dir():
     return os.path.dirname(os.path.realpath(__file__))
 
 
-def docker_init():
-    tag = "amperecomputingai/llama.cpp:1.2.3"
-    if subprocess.run(
-            ["docker", "pull", tag]).returncode != 0:
-        print("Docker pull process failed!")
-        sys.exit(1)
+def docker_init(docker_image):
+    # if subprocess.run(
+    #        ["docker", "pull", docker_image]).returncode != 0:
+    #    print("Docker pull process failed!")
+    #    sys.exit(1)
     container_name = "llama_benchmark"
     subprocess.run(["docker", "rm", "-f", container_name])
     memory = (psutil.virtual_memory().total >> 30) - 30  # leave 30GB for OS
     assert memory > 10, "less than 10GB of memory available on the system for llama.cpp"
     if subprocess.run(
             ["docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{str(memory)}g", "-v",
-             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0:
+             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", docker_image]).returncode != 0:
         print("Docker run process failed!")
         sys.exit(1)
     return container_name
@@ -52,7 +51,8 @@ def docker_start():
 def benchmark(docker_container_name, args):
     num_available_threads = len(parse_threads_range(args.threads_range))
     if num_available_threads < max(args.num_threads):
-        print(f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})")
+        print(
+            f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})")
         sys.exit(1)
 
     docker_restart(docker_container_name)
@@ -63,11 +63,11 @@ def benchmark(docker_container_name, args):
                     num_processes = int(num_available_threads / num_threads)
                     case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}"
                     print(f"\nRunning {case}")
-    
+
                     cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {str(num_processes)} "
                            f"-t {str(num_threads)} -b {str(batch_size)} -p {str(prompt_size)} -r {args.threads_range}")
                     cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd]
-    
+
                     print(f"Executing: {' '.join(cmd)}")
                     success = False
                     start = time.time()
@@ -90,6 +90,9 @@ def parse_args():
     parser.add_argument("-m", "--model_names",
                         type=str, required=True, nargs="+",
                         help="model names, e.g. 'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
+    parser.add_argument("-d", "--docker_image",
+                        type=str, required=True,
+                        help="Docker image to use for benchmarking")
     parser.add_argument("-t", "--num_threads",
                         type=int, required=True, nargs="+",
                         help="number of threads per process to use")
@@ -111,8 +114,8 @@ def parse_args():
 
 def main():
     args = parse_args()
-    benchmark(docker_init(), args)
+    benchmark(docker_init(args.docker_image), args)
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 8 12 16 24 48 -b 1 2 4 8 16 32 -p 128 -r 0-47 -d amperecomputingai/llama.cpp:latest
 rm -f /tmp/log_power
diff --git a/benchmarks/utils/Dockerfile b/benchmarks/utils/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:22.04
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install -y build-essential cmake vim wget git numactl libopenblas-dev pkg-config python3 python3-pip libnuma-dev clang
+RUN mkdir /workspace
+RUN mkdir /llm
+RUN cd /workspace && git clone -b b3615 https://github.com/ggerganov/llama.cpp.git && cd llama.cpp && make -j && mv /workspace/llama.cpp/llama-batched-bench /llm/
+RUN rm -R /workspace
diff --git a/benchmarks/utils/benchmark.py b/benchmarks/utils/benchmark.py
@@ -117,8 +117,8 @@ def main():
     for n in range(args.num_processes):
         logfile = f"{logs_dir}/log_{n}"
         cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
-               "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
-               str(args.batch_size), str(args.num_threads)]
+                "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
+               "-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "-td", str(args.num_threads)]
         current_subprocesses.append(
             subprocess.Popen(cmd, stdout=open(logfile, 'wb'), stderr=open(logfile, 'wb')))
     start = time.time()
@@ -130,4 +130,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()