
Commit c0be115

amumu96 and wuzhaoxin authored
FEAT: support qwenvl2 vllm engine (#2428)
Co-authored-by: wuzhaoxin <15667065080@162.com>
1 parent b295f9a commit c0be115

18 files changed: +167 −67 lines

xinference/model/llm/llm_family.json

Lines changed: 9 additions & 12 deletions
```diff
@@ -6909,18 +6909,15 @@
         "model_id": "Qwen/Qwen2-VL-72B-Instruct-GPTQ-{quantization}"
       }
     ],
-    "prompt_style": {
-      "style_name": "QWEN",
-      "system_prompt": "You are a helpful assistant",
-      "roles": [
-        "user",
-        "assistant"
-      ],
-      "stop": [
-        "<|im_end|>",
-        "<|endoftext|>"
-      ]
-    }
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
   },
   {
     "version": 1,
```

xinference/model/llm/llm_family_modelscope.json

Lines changed: 9 additions & 8 deletions
```diff
@@ -4627,14 +4627,15 @@
         "model_hub": "modelscope"
       }
     ],
-    "prompt_style": {
-      "style_name": "QWEN",
-      "system_prompt": "You are a helpful assistant",
-      "roles": [
-        "user",
-        "assistant"
-      ]
-    }
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ]
   },
   {
     "version": 1,
```

xinference/model/llm/transformers/cogvlm2.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -29,7 +29,7 @@
     parse_messages,
 )
 from .core import PytorchChatModel, PytorchGenerateConfig
-from .utils import get_max_src_len
+from .utils import cache_clean, get_max_src_len
 
 logger = logging.getLogger(__name__)
 
@@ -176,6 +176,7 @@ def get_query_and_history(
             query = content
         return query, image, history
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```
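Each transformers multimodal model in this commit gains the same `@cache_clean` decorator on its `chat` method (the hunks in the files below are identical in shape). The decorator's implementation lives in `xinference/model/llm/transformers/utils.py` and is not shown in this diff; a plausible sketch of what such a decorator could do, with the behavior inferred from the name rather than confirmed by the commit:

```python
# Hypothetical sketch only; the real cache_clean is defined in
# xinference/model/llm/transformers/utils.py and may differ.
import functools
import gc

import torch


def cache_clean(fn):
    """Free leftover accelerator cache after each chat call."""

    @functools.wraps(fn)
    def wrapper(self, *args, **kwargs):
        result = fn(self, *args, **kwargs)
        gc.collect()  # drop Python-side references left by preprocessing
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # return cached CUDA blocks to the driver
        return result

    return wrapper
```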

xinference/model/llm/transformers/cogvlm2_video.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -28,6 +28,7 @@
     parse_messages,
 )
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
@@ -227,6 +228,7 @@ def get_query_and_history(
 
         return query, image, video, history
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```

xinference/model/llm/transformers/deepseek_vl.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -28,6 +28,7 @@
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
@@ -137,6 +138,7 @@ def _fill_placeholder(_url, _index):
             return "".join(new_content), images
         return content, []
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```

xinference/model/llm/transformers/glm4v.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -26,7 +26,7 @@
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import _decode_image, generate_chat_completion, generate_completion_chunk
 from .core import PytorchChatModel, PytorchGenerateConfig
-from .utils import get_max_src_len
+from .utils import cache_clean, get_max_src_len
 
 logger = logging.getLogger(__name__)
 
@@ -129,6 +129,7 @@ def _get_processed_msgs(messages: List[Dict]) -> List[Dict]:
             res.append({"role": role, "content": text})
         return res
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```

xinference/model/llm/transformers/intern_vl.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -27,6 +27,7 @@
     parse_messages,
 )
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
@@ -326,6 +327,7 @@ def load(self, **kwargs):
             use_fast=False,
         )
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```

xinference/model/llm/transformers/minicpmv25.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -29,6 +29,7 @@
     parse_messages,
 )
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
@@ -119,6 +120,7 @@ def _message_content_to_chat(self, content):
             raise RuntimeError("Only one image per message is supported")
         return content, []
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```

xinference/model/llm/transformers/minicpmv26.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -30,6 +30,7 @@
     parse_messages,
 )
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
@@ -198,6 +199,7 @@ def _convert_to_specific_style(self, messages: List[Dict]) -> Tuple:
             msgs.append({"role": "user", "content": images_chat + [content]})
         return msgs, video_existed
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```

xinference/model/llm/transformers/omnilmm.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -24,6 +24,7 @@
 from ..llm_family import LLMFamilyV1, LLMSpecV1
 from ..utils import generate_chat_completion, parse_messages
 from .core import PytorchChatModel, PytorchGenerateConfig
+from .utils import cache_clean
 
 logger = logging.getLogger(__name__)
 
@@ -87,6 +88,7 @@ def _ensure_url(_url):
             return images, other_content
         return [], [{"type": "text", "text": content}]
 
+    @cache_clean
     def chat(
         self,
         messages: List[Dict],
```
