diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index b17b3d17ec..f4d0fc51e2 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -4767,6 +4767,7 @@
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -4846,6 +4847,7 @@
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -4866,6 +4868,7 @@
             {
                 "model_format": "awq",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -4885,6 +4888,7 @@
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -5215,6 +5219,7 @@
             {
                 "model_format": "mlx",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -5263,6 +5268,7 @@
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -5281,6 +5287,7 @@
             {
                 "model_format": "gptq",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -5311,6 +5318,116 @@
         "reasoning_start_tag": "<think>",
         "reasoning_end_tag": "</think>"
     },
+    {
+        "version": 2,
+        "context_length": 131072,
+        "model_name": "Deepseek-V3.1",
+        "model_lang": [
+            "en",
+            "zh"
+        ],
+        "model_ability": [
+            "chat",
+            "reasoning",
+            "hybrid",
+            "tools"
+        ],
+        "model_description": "DeepSeek-V3.1 is a hybrid model that supports both thinking mode and non-thinking mode.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
+                "model_src": {
+                    "huggingface": {
+                        "quantizations": [
+                            "none"
+                        ],
+                        "model_id": "deepseek-ai/DeepSeek-V3.1"
+                    },
+                    "modelscope": {
+                        "quantizations": [
+                            "none"
+                        ],
+                        "model_id": "deepseek-ai/DeepSeek-V3.1"
+                    }
+                }
+            },
+            {
+                "model_format": "gptq",
+                "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
+                "model_src": {
+                    "huggingface": {
+                        "quantizations": [
+                            "Int4"
+                        ],
+                        "model_id": "cpatonn/DeepSeek-V3.1-GPTQ-4bit"
+                    },
+                    "modelscope": {
+                        "quantizations": [
+                            "Int4"
+                        ],
+                        "model_id": "cpatonn/DeepSeek-V3.1-GPTQ-4bit"
+                    }
+                }
+            },
+            {
+                "model_format": "awq",
+                "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
+                "model_src": {
+                    "huggingface": {
+                        "quantizations": [
+                            "Int4"
+                        ],
+                        "model_id": "QuantTrio/DeepSeek-V3.1-AWQ"
+                    },
+                    "modelscope": {
+                        "quantizations": [
+                            "Int4"
+                        ],
+                        "model_id": "tclf90/DeepSeek-V3.1-AWQ"
+                    }
+                }
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
+                "model_src": {
+                    "huggingface": {
+                        "quantizations": [
+                            "8bit",
+                            "4bit"
+                        ],
+                        "model_id": "mlx-community/DeepSeek-V3.1-{quantization}"
+                    },
+                    "modelscope": {
+                        "quantizations": [
+                            "8bit",
+                            "4bit"
+                        ],
+                        "model_id": "mlx-community/DeepSeek-V3.1-{quantization}"
+                    }
+                }
+            }
+        ],
+        "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if not thinking is defined %}{% set thinking = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, system_prompt='', is_first_sp=true, is_last_user=false) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{%- set ns.is_first = false -%}{%- set ns.is_last_user = true -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}{%- if ns.is_last_user %}{{'<|Assistant|></think>'}}{%- endif %}{%- set ns.is_last_user = false -%}{%- set ns.is_first = false %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- else %}{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments'] + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}{%- if ns.is_last_user %}{{'<|Assistant|>'}}{%- if message['prefix'] is defined and message['prefix'] and thinking %}{{'<think>'}}{%- else %}{{'</think>'}}{%- endif %}{%- endif %}{%- set ns.is_last_user = false -%}{%- if ns.is_tool %}{{message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{%- set content = message['content'] -%}{%- if '</think>' in content %}{%- set content = content.split('</think>', 1)[1] -%}{%- endif %}{{content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_last_user = false -%}{%- set ns.is_tool = true -%}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endfor -%}{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool %}{{'<|Assistant|>'}}{%- if not thinking %}{{'</think>'}}{%- else %}{{'<think>'}}{%- endif %}{% endif %}",
+        "stop_token_ids": [
+            1
+        ],
+        "stop": [
+            "<|end▁of▁sentence|>"
+        ],
+        "reasoning_start_tag": "<think>",
+        "reasoning_end_tag": "</think>",
+        "virtualenv": {
+            "packages": [
+                "transformers==4.53.0"
+            ]
+        }
+    },
     {
         "version": 2,
         "context_length": 131072,
@@ -6242,6 +6359,7 @@
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -6262,6 +6380,7 @@
             {
                 "model_format": "awq",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -6281,6 +6400,7 @@
             {
                 "model_format": "ggufv2",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -6475,6 +6595,7 @@
             {
                 "model_format": "mlx",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -6517,6 +6638,7 @@
             {
                 "model_format": "pytorch",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
                 "model_src": {
                     "huggingface": {
                         "quantizations": [
@@ -6535,6 +6657,7 @@
             {
                 "model_format": "awq",
                 "model_size_in_billions": 671,
+                "activated_size_in_billions": 37,
"model_src": { "huggingface": { "quantizations": [ @@ -6553,6 +6676,7 @@ { "model_format": "mlx", "model_size_in_billions": 671, + "activated_size_in_billions": 37, "model_src": { "huggingface": { "quantizations": [ diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py index d6e8dd0efb..15bf3e07d8 100644 --- a/xinference/model/llm/utils.py +++ b/xinference/model/llm/utils.py @@ -82,7 +82,7 @@ "HuatuoGPT-o1-LLaMA-3.1", ] -DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528"] +DEEPSEEK_TOOL_CALL_FAMILY = ["deepseek-v3", "deepseek-r1-0528", "Deepseek-V3.1"] TOOL_CALL_FAMILY = ( QWEN_TOOL_CALL_FAMILY diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index c531d34972..df9411c8cd 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -273,6 +273,7 @@ class VLLMGenerateConfig(TypedDict, total=False): VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Instruct") VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Thinking") VLLM_SUPPORTED_CHAT_MODELS.append("Qwen3-Coder") + VLLM_SUPPORTED_CHAT_MODELS.append("Deepseek-V3.1") if VLLM_INSTALLED and VLLM_VERSION >= version.parse("0.10.0"): VLLM_SUPPORTED_CHAT_MODELS.append("glm-4.5")