
TPRD-1638: adding missed argument #8312


Merged: 10 commits, Jul 25, 2025
61 changes: 27 additions & 34 deletions Dockerfile.sdk
@@ -29,7 +29,7 @@
#

# Base image on the minimum Triton container
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.06-py3-min
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.07-py3-min

ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
@@ -60,44 +60,35 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1

RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
software-properties-common \
autoconf \
automake \
build-essential \
ca-certificates \
curl \
git \
gperf \
libb64-dev \
libgoogle-perftools-dev \
libopencv-dev \
libopencv-core-dev \
libopencv-dev \
libssl-dev \
libtool \
maven \
openjdk-11-jdk \
pkg-config \
python3 \
python3-pip \
python3-dev \
python3-wheel \
python3-pdfkit \
python3-pip \
python3-setuptools \
python3-wheel \
rapidjson-dev \
software-properties-common \
vim \
wget \
python3-pdfkit \
openjdk-11-jdk \
maven && \
pip3 install --upgrade "grpcio-tools<1.68"

# Client build requires recent version of CMake (FetchContent required)
# Using CMake installation instructions from: https://apt.kitware.com/
RUN apt update -q=2 \
&& apt install -y gpg wget \
&& wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \
&& . /etc/os-release \
&& echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \
&& apt-get update -q=2 \
&& apt-get install -y --no-install-recommends cmake=3.28.3* cmake-data=3.28.3* \
&& cmake --version
wget && \
pip3 install --upgrade "grpcio-tools<1.68" cmake==3.28.3

ENV CMAKE_POLICY_MINIMUM_REQUIRED=3.5

# Build expects "python" executable (not python3).
RUN rm -f /usr/bin/python && \
@@ -137,8 +128,7 @@ RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-DTRITON_ENABLE_JAVA_HTTP=ON \
-DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
-DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
RUN make -j16 cc-clients java-clients && \
rm -fr ~/.m2
RUN cmake --build . -v --parallel --target cc-clients java-clients

# TODO: PA will rebuild the CC clients since it depends on it.
# This should be optimized so that we do not have to build
@@ -156,6 +146,7 @@ RUN if [ "$TRITON_PERF_ANALYZER_BUILD" = "1" ]; then \
-DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
-DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
-DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
-DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
@@ -167,7 +158,7 @@ RUN if [ "$TRITON_PERF_ANALYZER_BUILD" = "1" ]; then \
-DTRITON_PACKAGE_PERF_ANALYZER=ON \
-DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
/workspace/perf_analyzer && \
make -j16 perf-analyzer python-clients && \
cmake --build . -v --parallel --target perf-analyzer python-clients && \
pip3 install build && \
cd /workspace/perf_analyzer/genai-perf && \
python3 -m build --wheel --outdir /workspace/install/python; \
@@ -180,12 +171,13 @@ RUN if [ "$TRITON_PERF_ANALYZER_BUILD" = "1" ]; then \
-DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
-DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
-DTRITON_ENABLE_PYTHON_HTTP=ON \
-DTRITON_ENABLE_PYTHON_GRPC=ON \
-DTRITON_PACKAGE_PERF_ANALYZER=ON \
-DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
/workspace/perf_analyzer && \
make -j16 python-clients && \
cmake --build . -v --parallel --target python-clients && \
mkdir -p /workspace/install/python && \
cp /workspace/perf_analyzer/genai_perf-*.whl /workspace/install/python/; \
fi
@@ -216,26 +208,27 @@ ARG TRITON_ENABLE_GPU

RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common \
curl \
default-jdk \
git \
gperf \
libb64-dev \
libgoogle-perftools-dev \
libopencv-dev \
libopencv-core-dev \
libopencv-dev \
libssl-dev \
libtool \
maven \
perl \
python3 \
python3-pip \
python3-dev \
python3-wheel \
python3-pdfkit \
python3-pip \
python3-setuptools \
python3-wheel \
software-properties-common \
vim \
wget \
python3-pdfkit \
maven \
default-jdk && \
wget && \
pip3 install "grpcio<1.68" "grpcio-tools<1.68"

WORKDIR /workspace
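The SDK Dockerfile now installs CMake from PyPI instead of the Kitware apt repository and drives the builds with `cmake --build --parallel` rather than `make -j16`. A minimal smoke test for the resulting image might look like the sketch below; the local image tag is an assumption, not part of this PR.

```
# Sanity-check the pip-installed CMake pin and the policy floor inside the
# freshly built SDK image; "triton-sdk:local" is an assumed local tag.
docker run --rm triton-sdk:local bash -c '
  cmake --version                        # expect 3.28.3 (from pip)
  echo "$CMAKE_POLICY_MINIMUM_REQUIRED"  # expect 3.5
'
```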
2 changes: 1 addition & 1 deletion TRITON_VERSION
@@ -1 +1 @@
2.59.0
2.59.1
6 changes: 3 additions & 3 deletions build.py
@@ -71,9 +71,9 @@
#

DEFAULT_TRITON_VERSION_MAP = {
"release_version": "2.59.0",
"triton_container_version": "25.06",
"upstream_container_version": "25.06",
"release_version": "2.59.1",
"triton_container_version": "25.07",
"upstream_container_version": "25.07",
"ort_version": "1.22.0",
"ort_openvino_version": "2025.1.0",
"standalone_openvino_version": "2025.1.0",
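These defaults can also be pinned explicitly at build time instead of relying on the map; a hedged sketch, assuming the long-standing `build.py` flags rather than anything introduced by this PR:

```
# Override DEFAULT_TRITON_VERSION_MAP entries on the command line;
# flag names assume the existing build.py CLI.
python3 build.py \
    --version=2.59.1 \
    --container-version=25.07 \
    --upstream-container-version=25.07
```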
2 changes: 1 addition & 1 deletion deploy/aws/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:25.06-py3
imageName: nvcr.io/nvidia/tritonserver:25.07-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://triton-inference-server-repository/model_repository
numGpus: 1
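For what it's worth, the bumped tag can also be supplied at install time without editing the chart; a sketch, assuming a release name of `triton-aws`:

```
# Override the default image when installing the AWS chart.
helm install triton-aws ./deploy/aws \
  --set image.imageName=nvcr.io/nvidia/tritonserver:25.07-py3
```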
2 changes: 1 addition & 1 deletion deploy/fleetcommand/Chart.yaml
@@ -26,7 +26,7 @@

apiVersion: v1
# appVersion is the Triton version; update when changing release
appVersion: "2.59.0"
appVersion: "2.59.1"
description: Triton Inference Server (Fleet Command)
name: triton-inference-server
# version is the Chart version; update when changing anything in the chart
6 changes: 3 additions & 3 deletions deploy/fleetcommand/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:25.06-py3
imageName: nvcr.io/nvidia/tritonserver:25.07-py3
pullPolicy: IfNotPresent
numGpus: 1
serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
#
# To set model control mode, uncomment and configure below
# TODO: Fix the following url, it is invalid
# See https://github.com/triton-inference-server/server/blob/r25.06/docs/user_guide/model_management.md
# See https://github.com/triton-inference-server/server/blob/r25.07/docs/user_guide/model_management.md
# for more details
#- --model-control-mode=explicit|poll|none
#
# Additional server args
#
# see https://github.com/triton-inference-server/server/blob/r25.06/README.md
# see https://github.com/triton-inference-server/server/blob/r25.07/README.md
# for more details

service:
2 changes: 1 addition & 1 deletion deploy/gcp/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:25.06-py3
imageName: nvcr.io/nvidia/tritonserver:25.07-py3
pullPolicy: IfNotPresent
modelRepositoryPath: gs://triton-inference-server-repository/model_repository
numGpus: 1
@@ -33,7 +33,7 @@ metadata:
namespace: default
spec:
containers:
- image: nvcr.io/nvidia/tritonserver:25.06-py3-sdk
- image: nvcr.io/nvidia/tritonserver:25.07-py3-sdk
imagePullPolicy: Always
name: nv-triton-client
securityContext:
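A hedged usage sketch for the client pod above; the manifest file name and pod name are assumptions, since the pod's `metadata.name` is not visible in this hunk:

```
# Deploy the SDK client pod and run a tool from the 25.07 SDK image.
kubectl apply -f client-pod.yaml                       # file name assumed
kubectl exec -it nv-triton-client -- perf_analyzer --help  # pod name assumed
```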
4 changes: 2 additions & 2 deletions deploy/gke-marketplace-app/server-deployer/build_and_push.sh
@@ -28,8 +28,8 @@
export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
export APP_NAME=tritonserver
export MAJOR_VERSION=2.59
export MINOR_VERSION=2.59.0
export NGC_VERSION=25.06-py3
export MINOR_VERSION=2.59.1
export NGC_VERSION=25.07-py3

docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION

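The script pulls the NGC image above; a plausible continuation, not shown in this hunk, retags and pushes it to the project registry using the exported variables:

```
# Hypothetical retag-and-push step implied by the variables above;
# the actual commands live outside this hunk.
docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$MINOR_VERSION
docker push $REGISTRY/$APP_NAME:$MINOR_VERSION
```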
@@ -28,4 +28,4 @@ apiVersion: v1
appVersion: "2.59"
description: Triton Inference Server
name: triton-inference-server
version: 2.59.0
version: 2.59.1
@@ -31,14 +31,14 @@ maxReplicaCount: 3
tritonProtocol: HTTP
# HPA GPU utilization autoscaling target
HPATargetAverageValue: 85
modelRepositoryPath: gs://triton_sample_models/25.06
publishedVersion: '2.59.0'
modelRepositoryPath: gs://triton_sample_models/25.07
publishedVersion: '2.59.1'
gcpMarketplace: true

image:
registry: gcr.io
repository: nvidia-ngc-public/tritonserver
tag: 25.06-py3
tag: 25.07-py3
pullPolicy: IfNotPresent
# modify the model repository here to match your GCP storage bucket
numGpus: 1
@@ -27,7 +27,7 @@
x-google-marketplace:
schemaVersion: v2
applicationApiVersion: v1beta1
publishedVersion: '2.59.0'
publishedVersion: '2.59.1'
publishedVersionMetadata:
releaseNote: >-
Initial release.
4 changes: 2 additions & 2 deletions deploy/gke-marketplace-app/server-deployer/schema.yaml
@@ -27,7 +27,7 @@
x-google-marketplace:
schemaVersion: v2
applicationApiVersion: v1beta1
publishedVersion: '2.59.0'
publishedVersion: '2.59.1'
publishedVersionMetadata:
releaseNote: >-
Initial release.
@@ -89,7 +89,7 @@ properties:
modelRepositoryPath:
type: string
title: Bucket where models are stored. Please make sure the user/service account creating the GKE app has permission to this GCS bucket. Read the Triton documentation for config and formatting details; TensorRT, TensorFlow, PyTorch, ONNX, etc. are supported.
default: gs://triton_sample_models/25.06
default: gs://triton_sample_models/25.07
image.ldPreloadPath:
type: string
title: Leave this empty by default. Triton allows users to create custom layers for backends, such as TensorRT plugins or TensorFlow custom ops; the compiled shared library must be provided via the LD_PRELOAD environment variable.
6 changes: 3 additions & 3 deletions deploy/gke-marketplace-app/trt-engine/README.md
@@ -33,7 +33,7 @@
```
docker run --gpus all -it --network host \
--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
-v ~:/scripts nvcr.io/nvidia/tensorrt:25.06-py3
-v ~:/scripts nvcr.io/nvidia/tensorrt:25.07-py3

pip install onnx six torch tf2onnx tensorflow

@@ -57,7 +57,7 @@ mkdir -p engines

python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh

gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/25.06/bert/1/model.plan
gsutil cp bert_large_int8_bs1_s128.engine gs://triton_sample_models/25.07/bert/1/model.plan
```

For each Triton upgrade, the container version used to generate the model and the model path in GCS `gs://triton_sample_models/25.06/` should be updated accordingly.
For each Triton upgrade, the container version used to generate the model and the model path in GCS `gs://triton_sample_models/25.07/` should be updated accordingly.
2 changes: 1 addition & 1 deletion deploy/k8s-onprem/values.yaml
@@ -30,7 +30,7 @@ tags:
openshift: false

image:
imageName: nvcr.io/nvidia/tritonserver:25.06-py3
imageName: nvcr.io/nvidia/tritonserver:25.07-py3
pullPolicy: IfNotPresent
modelRepositoryServer: < Replace with the IP Address of your file server >
modelRepositoryPath: /srv/models
2 changes: 1 addition & 1 deletion deploy/oci/values.yaml
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:25.06-py3
imageName: nvcr.io/nvidia/tritonserver:25.07-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository
numGpus: 1
1 change: 1 addition & 0 deletions docs/introduction/compatibility.md
@@ -38,6 +38,7 @@

| Triton release version | NGC Tag | Python version | Torch version | TensorRT version | TensorRT-LLM version | CUDA version | CUDA Driver version | Size |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 25.07 | nvcr.io/nvidia/tritonserver:25.07-trtllm-python-py3 | Python 3.12.3 | 2.7.0a0+79aa17489c.nv25.4 | 10.10.0.31 | 0.20.0 | 12.9.0.036 | 575.51.03 | 18.3G |
| 25.06 | nvcr.io/nvidia/tritonserver:25.06-trtllm-python-py3 | Python 3.12.3 | 2.7.0a0+79aa17489c.nv25.4 | 10.10.0.31 | 0.20.0 | 12.9.0.036 | 575.51.03 | 18.3G |
| 25.05 | nvcr.io/nvidia/tritonserver:25.05-trtllm-python-py3 | Python 3.12.3 | 2.7.0a0+7c8ec84dab.nv25.3 | 10.9.0.34 | 0.19.0 | 12.8.1.012 | 570.124.06 | 17G |
| 25.04 | nvcr.io/nvidia/tritonserver:25.04-trtllm-python-py3 | Python 3.12.3 | 2.7.0a0+7c8ec84dab.nv25.3 | 10.9.0.34 | 0.18.2 | 12.8.1.012 | 570.124.06 | 17G |
2 changes: 1 addition & 1 deletion python/openai/README.md
@@ -51,7 +51,7 @@
docker run -it --net=host --gpus all --rm \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-e HF_TOKEN \
nvcr.io/nvidia/tritonserver:25.06-vllm-python-py3
nvcr.io/nvidia/tritonserver:25.07-vllm-python-py3
```

2. Launch the OpenAI-compatible Triton Inference Server:
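Once the container is up and the OpenAI-compatible frontend is launched in step 2, a request along the following lines should work; port 9000 and the model name are assumptions based on the frontend's documented defaults, not on this diff:

```
# Illustrative chat-completions request against the OpenAI-compatible
# frontend; adjust port and model to your deployment.
curl -s http://localhost:9000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama-3.1-8b-instruct",
       "messages": [{"role": "user", "content": "Hello!"}]}'
```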
2 changes: 1 addition & 1 deletion qa/common/gen_jetson_trt_models
@@ -34,7 +34,7 @@
# Make all generated files accessible outside of container
umask 0000
# Set the version of the models
TRITON_VERSION=${TRITON_VERSION:=25.06}
TRITON_VERSION=${TRITON_VERSION:=25.07}
# Set the CUDA device to use
CUDA_DEVICE=${RUNNER_ID:=0}
# Set TensorRT image
2 changes: 1 addition & 1 deletion qa/common/gen_qa_custom_ops
@@ -37,7 +37,7 @@
##
############################################################################

TRITON_VERSION=${TRITON_VERSION:=25.06}
TRITON_VERSION=${TRITON_VERSION:=25.07}
NVIDIA_UPSTREAM_VERSION=${NVIDIA_UPSTREAM_VERSION:=$TRITON_VERSION}
PYTORCH_IMAGE=${PYTORCH_IMAGE:=nvcr.io/nvidia/pytorch:$NVIDIA_UPSTREAM_VERSION-py3}
UBUNTU_IMAGE=${UBUNTU_IMAGE:=ubuntu:24.04}
2 changes: 1 addition & 1 deletion qa/common/gen_qa_model_repository
@@ -48,7 +48,7 @@
##
############################################################################

TRITON_VERSION=${TRITON_VERSION:=25.06}
TRITON_VERSION=${TRITON_VERSION:=25.07}

# ONNX. Use ONNX_OPSET 0 to use the default for ONNX version
ONNX_VERSION=1.16.1
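These qa scripts all default the release via `${TRITON_VERSION:=25.07}`, so callers can pin a different release without editing the files; a hedged invocation sketch, assuming the repo layout above:

```
# ${VAR:=default} only assigns when TRITON_VERSION is unset, so an
# environment override wins over the in-script default.
TRITON_VERSION=25.06 ./qa/common/gen_qa_model_repository
```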