AmpereComputingAI · MarcelWilnicki · Aug 22, 2024 · Aug 22, 2024 · Aug 22, 2024 · Aug 22, 2024
diff --git a/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv b/benchmarks/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf@PP128@TG256.csv
@@ -0,0 +1,48 @@
+n_proc,n_threads,batch_size,prompt_size,output_tokens,pp_throughput_tps,pp_avg_latency_sec,tg_throughput_tps,tg_avg_latency_sec,pp+tg_throughput_tps,concurrency
+16,8,1,128,256,396.38615284214126,5.166812500000001,53.9083856577509,0.29706616210937503,70.1353850368713,16
+10,12,1,128,256,370.02208672228636,3.4593000000000003,60.55562550646733,0.16513789062499998,83.72397252807151,10
+8,16,1,128,256,384.57009450522344,2.66275,52.60371538348852,0.15346875000000004,61.203753511445825,8
+5,24,1,128,256,350.31630996146095,1.827,48.527500826418034,0.10303984375,67.38497174744674,5
+4,32,1,128,256,363.9239005060785,1.407,43.38138992258577,0.09220800781249999,60.9620574694396,4
+2,48,1,128,256,269.9215189310567,0.9484999999999999,44.06198301912035,0.045390625000000004,61.102713024106926,2
+2,64,1,128,256,321.4985100042571,0.7965,38.03764799362936,0.052583984375000004,53.44839585218178,2
+1,128,1,128,256,217.31748726655348,0.589,26.0905014268243,0.038328125,36.91952696856072,1
+16,8,2,128,256,418.19975883865374,9.794625000000002,110.64770308944307,0.28933642578125,139.80794611568743,32
+10,12,2,128,256,383.6810120647905,6.672500000000001,123.93800740714173,0.16137148437499998,159.67731875168928,20
+8,16,2,128,256,405.32717604607586,5.052875,95.94191098030699,0.16694531250000003,120.60774999018491,16
+5,24,2,128,256,370.0741462149616,3.4588,106.02687031013078,0.09431953125,138.05004314063848,10
+4,32,2,128,256,388.9014496996768,2.63325,89.76305574034251,0.0891298828125,119.70541246152048,8
+2,48,2,128,256,291.0064463943994,1.7595,69.88091980884391,0.057240234375000004,93.4932132205247,4
+2,64,2,128,256,357.67939408136175,1.4315,68.79871778810667,0.058140625,94.06577255190152,4
+1,128,2,128,256,268.6253934942288,0.953,47.66337739713275,0.0419609375,65.66908935442497,2
+16,8,4,128,256,417.81774681469346,19.6070625,198.07566817111712,0.32324731445312505,230.91884578161554,64
+10,12,4,128,256,370.82495128900973,13.809199999999999,200.5339634743387,0.199467578125,235.79981578139393,40
+8,16,4,128,256,410.30508573974623,9.982875,169.34048025895996,0.18912255859375002,198.97017390459538,32
+5,24,4,128,256,366.9544687265254,6.9768,180.1851591265522,0.11099921875,215.5971029139296,20
+4,32,4,128,256,392.2466482549231,5.2215,150.15846988520602,0.10655761718750001,187.8726722319053,16
+2,48,4,128,256,284.5256753784404,3.599,115.78471872089322,0.06909375000000001,144.2185812872635,8
+2,64,4,128,256,359.8034102940944,2.846,118.77514277579678,0.06735546875000001,152.3280606932117,8
+1,128,4,128,256,281.3186813186813,1.82,77.25969518635884,0.0517734375,101.89730662067136,4
+16,8,8,128,256,414.18260946754066,39.55800000000001,262.8345920588023,0.48716674804687504,287.23702664796633,128
+10,12,8,128,256,369.413996605541,27.72,247.47420827069718,0.32326640625,277.6798546519511,80
+8,16,8,128,256,406.2895840736078,20.163125,249.33713209113648,0.25785986328125,250.1603200293156,64
+5,24,8,128,256,362.30700183382214,14.131800000000002,224.36780736740155,0.17828125,255.81666472361474,40
+4,32,8,128,256,391.70590920021067,10.457,218.82873225516573,0.146515625,241.2439139312078,32
+2,48,8,128,256,292.82435876636123,6.994,155.60832128208014,0.10282226562499999,184.3273730949238,16
+2,64,8,128,256,359.96800722648663,5.689500000000001,172.949889329783,0.09251562499999999,207.99620840245097,16
+1,128,8,128,256,279.93439037725534,3.658,112.24377945851147,0.0712734375,140.24835646457268,8
+16,8,16,128,256,394.31195681747676,83.102625,220.8452042927132,1.159221923828125,255.87073197395068,256
+10,12,16,128,256,354.8340487251948,57.71869999999999,220.08028218556274,0.7270101562500001,251.46215584596305,160
+8,16,16,128,256,386.3011539159702,42.413250000000005,242.90285753371558,0.5270434570312499,271.0428798305982,128
+5,24,16,128,256,343.2492894093842,29.8336,219.71721397054426,0.3641046875,249.1160919913069,80
+4,32,16,128,256,368.9281498405472,22.2055,223.26757403084713,0.28668554687500003,254.80559875583202,64
+2,48,16,128,256,271.58209471225365,15.082,159.5917106959114,0.20051171875,184.9293421824913,32
+2,64,16,128,256,335.98882759548496,12.190999999999999,184.51845046619877,0.17343750000000002,215.52223099184425,32
+1,128,16,128,256,258.9127686472819,7.91,122.06824616301594,0.13107421875,148.17315808513203,16
+10,12,32,128,256,322.83991537277467,126.8774,151.25965729387573,2.1155941406250003,183.3408680939443,320
+8,16,32,128,256,346.5517507716242,94.55537500000003,176.17688788074838,1.453111328125,209.7223585231709,256
+5,24,32,128,256,308.451153580514,66.4,169.41941024156043,0.94440390625,198.67935144644,160
+4,32,32,128,256,326.4712519372915,50.186,175.4579368054558,0.7295205078125,207.09879664273433,128
+2,48,32,128,256,236.30887564930254,34.6665,127.3553249285536,0.50253125,150.41496315518887,64
+2,64,32,128,256,289.23018367609586,28.3245,146.60042212942176,0.436591796875,174.04482844091922,64
+1,128,32,128,256,219.80144888650386,18.635,101.9019541988531,0.31402734375,124.08862318986931,32
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -1,25 +1,53 @@
-# Running benchmark
+# Wrapper for multi-process / batched benchmark of llama.cpp
 
-This benchmarking tool runs multi-process, throughput-oriented benchmark of Ampere optimized llama.cpp using arbitrary model(s) provided by the user. 
-The benchmarking script spawns multiple parallel streams of token generation using llama.cpp and provides user with aggregate metrics of both prompt eval and token generation stages.
-Underneath, the _batched-bench_ script from upstream llama.cpp project is being used in an unaltered form.
-The script orchestrates the benchmark inside Docker container from the outside environment, **therefore this script should not be run inside Docker container.**
 
-## Setup
-Few dependencies need to be installed first. On Debian-based systems you can use the setup script.
+## ARM
+Instructions assume you have a Debian-based OS.
 ```bash
+cd benchmarks
 sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
+# quick run
+sudo python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_4.gguf Meta-Llama-3-8B-Instruct.Q8R16.gguf -t 128 -b 1 -p 128 -r 0-127 -d amperecomputingai/llama.cpp:latest
 ```
 
-## Downloading models
-Any GGUF model is expected to work, if you experience troubles running your network of choice please raise an [issue](https://github.com/AmpereComputingAI/llama.cpp/issues/new/choose).
-Benchmarking script expects models to be placed under _**llama.cpp/benchmarks/models**_ dir.
+## x86
+Instructions assume you have a Debian-based OS.
 ```bash
-mkdir -p models
-huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir models --local-dir-use-symlinks False
+cd benchmarks
+sudo bash setup_deb.sh
+# vim download_models.sh # uncomment / add models you want to download
+bash download_models.sh
+
+cd utils
+sudo docker build -t llama_x86 .
+cd ..
+# quick run
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q4_K_M.gguf Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 128 -b 1 -p 128 -r 0-127 -d llama_x86:latest
 ```
 
-## Benchmark
+Benchmarks will take a moment with the default settings.
+After they complete, you will find .csv files with the results in the benchmarks directory of this repo.
+
+### Results on Altra Max
+The results were gathered using the amperecomputingai/llama.cpp:1.2.6 image with aio optimizations on an Altra Max.
+
+#### Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
+
+| n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps |
+|--------|-----------|------------|-------------|---------------|----------------------------------------|
+| 16     | 8         | 8          | 128         | 256           | 262.83                                 |
+
+
+#### Meta-Llama-3-8B-Instruct.Q8R16.gguf
+
+| n_proc | n_threads | batch_size | prompt_size | output_tokens | total token generation capability, tps |
+|--------|-----------|------------|-------------|---------------|----------------------------------------|
+| 10     | 12        | 16         | 128         | 256           | 294.23                                 |
+
+
+## run.py options
 Provide run.py Python script with following arguments:
 - -m, filename(s) of model(s) that should be available under _**llama.cpp/benchmarks/models**_ directory, multiple models can be provided
 - -t, threadpool(s) per single process, e.g., if there are 20 threads available on the system, if -t 10 is provided, 2 parallel processes will be spawned, each using 10 threads;
@@ -29,11 +57,4 @@ Provide run.py Python script with following arguments:
 - -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless user wants to use just a subset of available threads, say 16-63 (48 threads indexed 16<>63)
 ```bash
 python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
-```
-
-## Quick run on 80t OCI A1 system
-```bash
-bash setup_deb.sh  # works on Debian-based systems
-bash download_models.sh  # uncomment preferred models in the file, by default llama3 q8_0 will be downloaded
-bash run.sh  # modify to adjust number of threads available and other parameters
-```
+```
diff --git a/benchmarks/download_models.sh b/benchmarks/download_models.sh
@@ -8,4 +8,7 @@ mkdir -p $SCRIPT_DIR/models
 #huggingface-cli download TheBloke/Llama-2-13B-GGUF llama-2-13b.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 #huggingface-cli download TheBloke/Llama-2-70B-GGUF llama-2-70b.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
 huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q8_0.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
-#huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
+huggingface-cli download QuantFactory/Meta-Llama-3-8B-Instruct-GGUF Meta-Llama-3-8B-Instruct.Q4_K_M.gguf --local-dir $SCRIPT_DIR/models --local-dir-use-symlinks False
+
+wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q4_K_4.gguf
+wget -P models https://ampereaimodelzoo.s3.eu-central-1.amazonaws.com/gguf/Meta-Llama-3-8B-Instruct.Q8R16.gguf
diff --git a/benchmarks/run.py b/benchmarks/run.py
@@ -11,19 +11,18 @@ def get_file_dir():
     return os.path.dirname(os.path.realpath(__file__))
 
 
-def docker_init():
-    tag = "amperecomputingai/llama.cpp:1.2.3"
-    if subprocess.run(
-            ["docker", "pull", tag]).returncode != 0:
-        print("Docker pull process failed!")
-        sys.exit(1)
+def docker_init(docker_image):
+    # if subprocess.run(
+    #        ["docker", "pull", docker_image]).returncode != 0:
+    #    print("Docker pull process failed!")
+    #    sys.exit(1)
     container_name = "llama_benchmark"
     subprocess.run(["docker", "rm", "-f", container_name])
     memory = (psutil.virtual_memory().total >> 30) - 30  # leave 30GB for OS
     assert memory > 10, "less than 10GB of memory available on the system for llama.cpp"
     if subprocess.run(
             ["docker", "run", "--privileged=true", "--name", container_name, "-d", "-m", f"{str(memory)}g", "-v",
-             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0:
+             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", docker_image]).returncode != 0:
         print("Docker run process failed!")
         sys.exit(1)
     return container_name
@@ -52,7 +51,8 @@ def docker_start():
 def benchmark(docker_container_name, args):
     num_available_threads = len(parse_threads_range(args.threads_range))
     if num_available_threads < max(args.num_threads):
-        print(f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})")
+        print(
+            f"Requested number of threads ({max(args.num_threads)}) exceeds threads available ({num_available_threads})")
         sys.exit(1)
 
     docker_restart(docker_container_name)
@@ -63,11 +63,11 @@ def benchmark(docker_container_name, args):
                     num_processes = int(num_available_threads / num_threads)
                     case = f"{num_processes} x {num_threads} [proc x threads], bs = {batch_size}"
                     print(f"\nRunning {case}")
-    
+
                     cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {str(num_processes)} "
                            f"-t {str(num_threads)} -b {str(batch_size)} -p {str(prompt_size)} -r {args.threads_range}")
                     cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd]
-    
+
                     print(f"Executing: {' '.join(cmd)}")
                     success = False
                     start = time.time()
@@ -90,6 +90,9 @@ def parse_args():
     parser.add_argument("-m", "--model_names",
                         type=str, required=True, nargs="+",
                         help="model names, e.g. 'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
+    parser.add_argument("-d", "--docker_image",
+                        type=str, required=True,
+                        help="Docker image to use for benchmarking")
     parser.add_argument("-t", "--num_threads",
                         type=int, required=True, nargs="+",
                         help="number of threads per process to use")
@@ -111,8 +114,8 @@ def parse_args():
 
 def main():
     args = parse_args()
-    benchmark(docker_init(), args)
+    benchmark(docker_init(args.docker_image), args)
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/benchmarks/run.sh b/benchmarks/run.sh
diff --git a/benchmarks/utils/Dockerfile b/benchmarks/utils/Dockerfile
@@ -0,0 +1,7 @@
+FROM ubuntu:22.04
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt-get update -y && apt-get install -y build-essential cmake vim wget git numactl libopenblas-dev pkg-config python3 python3-pip libnuma-dev clang
+RUN mkdir /workspace
+RUN mkdir /llm
+RUN cd /workspace && git clone -b b3615 https://github.com/ggerganov/llama.cpp.git && cd llama.cpp && make -j && mv /workspace/llama.cpp/llama-batched-bench /llm/
+RUN rm -R /workspace
diff --git a/benchmarks/utils/benchmark.py b/benchmarks/utils/benchmark.py
@@ -117,8 +117,8 @@ def main():
     for n in range(args.num_processes):
         logfile = f"{logs_dir}/log_{n}"
         cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
-               "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
-               str(args.batch_size), str(args.num_threads)]
+                "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
+               "-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "-td", str(args.num_threads)]
         current_subprocesses.append(
             subprocess.Popen(cmd, stdout=open(logfile, 'wb'), stderr=open(logfile, 'wb')))
     start = time.time()
@@ -130,4 +130,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()