
vllm

vLLM: a high-throughput and memory-efficient inference engine for LLMs

Modules:

adapter_commons
assets
attention
beam_search
benchmarks
collect_env
compilation
config
connections
core
device_allocator
distributed
engine
entrypoints
env_override
envs
executor
forward_context
inputs
logger: Logging configuration for vLLM.
logging_utils
logits_process
lora
model_executor
multimodal
outputs
platforms
plugins
pooling_params
profiler
ray
reasoning
sampling_params: Sampling parameters for text generation.
scalar_type
scripts
sequence: Sequence and its related classes.
tasks
test_utils
third_party
tracing
transformers_utils
triton_utils
usage
utils
v1
version
worker

MODULE_ATTRS module-attribute

MODULE_ATTRS = {
    "AsyncEngineArgs": ".engine.arg_utils:AsyncEngineArgs",
    "EngineArgs": ".engine.arg_utils:EngineArgs",
    "AsyncLLMEngine": ".engine.async_llm_engine:AsyncLLMEngine",
    "LLMEngine": ".engine.llm_engine:LLMEngine",
    "LLM": ".entrypoints.llm:LLM",
    "initialize_ray_cluster": ".executor.ray_utils:initialize_ray_cluster",
    "PromptType": ".inputs:PromptType",
    "TextPrompt": ".inputs:TextPrompt",
    "TokensPrompt": ".inputs:TokensPrompt",
    "ModelRegistry": ".model_executor.models:ModelRegistry",
    "SamplingParams": ".sampling_params:SamplingParams",
    "PoolingParams": ".pooling_params:PoolingParams",
    "ClassificationOutput": ".outputs:ClassificationOutput",
    "ClassificationRequestOutput": ".outputs:ClassificationRequestOutput",
    "CompletionOutput": ".outputs:CompletionOutput",
    "EmbeddingOutput": ".outputs:EmbeddingOutput",
    "EmbeddingRequestOutput": ".outputs:EmbeddingRequestOutput",
    "PoolingOutput": ".outputs:PoolingOutput",
    "PoolingRequestOutput": ".outputs:PoolingRequestOutput",
    "RequestOutput": ".outputs:RequestOutput",
    "ScoringOutput": ".outputs:ScoringOutput",
    "ScoringRequestOutput": ".outputs:ScoringRequestOutput",
}
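
Example: MODULE_ATTRS maps each public name to the submodule that defines it so that `import vllm` stays lightweight and heavy submodules are only imported on first attribute access. A minimal sketch of the lazy-import pattern this enables (the actual `__getattr__` in `vllm/__init__.py` may differ in detail):

```
import importlib

def __getattr__(name: str):
    # Resolve e.g. "LLM" -> ".entrypoints.llm:LLM" only when first accessed.
    if name in MODULE_ATTRS:
        module_name, _, attr_name = MODULE_ATTRS[name].partition(":")
        module = importlib.import_module(module_name, package="vllm")
        return getattr(module, attr_name)
    raise AttributeError(f"module 'vllm' has no attribute {name!r}")
```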

ModelRegistry module-attribute

ModelRegistry = _ModelRegistry(
    {
        model_arch: (
            _LazyRegisteredModel(
                module_name=f"vllm.model_executor.models.{mod_relname}",
                class_name=cls_name,
            )
        )
        for (model_arch, (mod_relname, cls_name)) in (
            items()
        )
    }
)
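
Example: architectures are resolved lazily, and out-of-tree models can be registered at runtime with `ModelRegistry.register_model`. A hedged sketch (the plugin module path and class name are illustrative):

```
from vllm import ModelRegistry

# Register a custom architecture by "module:class" path so the class is only
# imported when a model with this architecture is actually loaded.
ModelRegistry.register_model(
    "MyLlamaForCausalLM", "my_plugin.modeling:MyLlamaForCausalLM")
```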

PromptType module-attribute

Set of possible schemas for an LLM input, including both decoder-only and encoder/decoder input types.
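
Example: a decoder-only request can be given as a plain string, a `TextPrompt`, or a `TokensPrompt`. A minimal sketch (the model name is illustrative):

```
from vllm import LLM, SamplingParams, TextPrompt, TokensPrompt

llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(
    [
        "A plain string prompt",
        TextPrompt(prompt="A dict-style text prompt"),
        TokensPrompt(prompt_token_ids=[1, 2, 3]),
    ],
    SamplingParams(max_tokens=16),
)
```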

__all__ module-attribute

__all__ = [
    "__version__",
    "__version_tuple__",
    "LLM",
    "ModelRegistry",
    "PromptType",
    "TextPrompt",
    "TokensPrompt",
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "PoolingOutput",
    "PoolingRequestOutput",
    "EmbeddingOutput",
    "EmbeddingRequestOutput",
    "ClassificationOutput",
    "ClassificationRequestOutput",
    "ScoringOutput",
    "ScoringRequestOutput",
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    "initialize_ray_cluster",
    "PoolingParams",
]
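
Example: these names form the stable top-level API; the usual offline entry point is `LLM` with `SamplingParams`. A minimal sketch (model and prompt are illustrative):

```
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
print(outputs[0].outputs[0].text)
```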

AsyncEngineArgs dataclass

Bases: EngineArgs

Arguments for asynchronous vLLM engine.

Source code in vllm/engine/arg_utils.py
@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    enable_log_requests: bool = False

    @property
    @deprecated(
        "`disable_log_requests` is deprecated and has been replaced with "
        "`enable_log_requests`. This will be removed in v0.12.0. Please use "
        "`enable_log_requests` instead.")
    def disable_log_requests(self) -> bool:
        return not self.enable_log_requests

    @disable_log_requests.setter
    @deprecated(
        "`disable_log_requests` is deprecated and has been replaced with "
        "`enable_log_requests`. This will be removed in v0.12.0. Please use "
        "`enable_log_requests` instead.")
    def disable_log_requests(self, value: bool):
        self.enable_log_requests = not value

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser,
                     async_args_only: bool = False) -> FlexibleArgumentParser:
        # Initialize plugins to update the parser. For example, a plugin may
        # add a new quantization method to the --quantization argument or
        # a new device to the --device argument.
        load_general_plugins()
        if not async_args_only:
            parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--enable-log-requests',
                            action=argparse.BooleanOptionalAction,
                            default=AsyncEngineArgs.enable_log_requests,
                            help='Enable logging requests.')
        parser.add_argument('--disable-log-requests',
                            action=argparse.BooleanOptionalAction,
                            default=not AsyncEngineArgs.enable_log_requests,
                            help='[DEPRECATED] Disable logging requests.',
                            deprecated=True)
        current_platform.pre_register_and_update(parser)
        return parser
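
Example: a hedged sketch of exposing these arguments on a CLI and building the dataclass from the parsed args (assuming `FlexibleArgumentParser` is importable from `vllm.utils`, as in current releases):

```
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="vLLM async engine demo")
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args(["--model", "facebook/opt-125m",
                          "--enable-log-requests"])
engine_args = AsyncEngineArgs.from_cli_args(args)
```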

disable_log_requests property writable

disable_log_requests: bool

enable_log_requests class-attribute instance-attribute

enable_log_requests: bool = False

__init__

__init__(
    model: str = model,
    served_model_name: Optional[
        Union[str, List[str]]
    ] = served_model_name,
    tokenizer: Optional[str] = tokenizer,
    hf_config_path: Optional[str] = hf_config_path,
    runner: RunnerOption = runner,
    convert: ConvertOption = convert,
    task: Optional[TaskOption] = task,
    skip_tokenizer_init: bool = skip_tokenizer_init,
    enable_prompt_embeds: bool = enable_prompt_embeds,
    tokenizer_mode: TokenizerMode = tokenizer_mode,
    trust_remote_code: bool = trust_remote_code,
    allowed_local_media_path: str = allowed_local_media_path,
    download_dir: Optional[str] = download_dir,
    load_format: Union[str, LoadFormats] = load_format,
    config_format: str = config_format,
    dtype: ModelDType = dtype,
    kv_cache_dtype: CacheDType = cache_dtype,
    seed: Optional[int] = seed,
    max_model_len: Optional[int] = max_model_len,
    cuda_graph_sizes: list[int] = get_field(
        SchedulerConfig, "cuda_graph_sizes"
    ),
    distributed_executor_backend: Optional[
        Union[
            str,
            DistributedExecutorBackend,
            Type[ExecutorBase],
        ]
    ] = distributed_executor_backend,
    pipeline_parallel_size: int = pipeline_parallel_size,
    tensor_parallel_size: int = tensor_parallel_size,
    data_parallel_size: int = data_parallel_size,
    data_parallel_rank: Optional[int] = None,
    data_parallel_start_rank: Optional[int] = None,
    data_parallel_size_local: Optional[int] = None,
    data_parallel_address: Optional[str] = None,
    data_parallel_rpc_port: Optional[int] = None,
    data_parallel_hybrid_lb: bool = False,
    data_parallel_backend: str = data_parallel_backend,
    enable_expert_parallel: bool = enable_expert_parallel,
    eplb_config: EPLBConfig = get_field(
        ParallelConfig, "eplb_config"
    ),
    enable_eplb: bool = enable_eplb,
    num_redundant_experts: int = num_redundant_experts,
    eplb_window_size: int = window_size,
    eplb_step_interval: int = step_interval,
    eplb_log_balancedness: bool = log_balancedness,
    max_parallel_loading_workers: Optional[
        int
    ] = max_parallel_loading_workers,
    block_size: Optional[BlockSize] = block_size,
    enable_prefix_caching: Optional[
        bool
    ] = enable_prefix_caching,
    prefix_caching_hash_algo: PrefixCachingHashAlgo = prefix_caching_hash_algo,
    disable_sliding_window: bool = disable_sliding_window,
    disable_cascade_attn: bool = disable_cascade_attn,
    swap_space: float = swap_space,
    cpu_offload_gb: float = cpu_offload_gb,
    gpu_memory_utilization: float = gpu_memory_utilization,
    max_num_batched_tokens: Optional[
        int
    ] = max_num_batched_tokens,
    max_num_partial_prefills: int = max_num_partial_prefills,
    max_long_partial_prefills: int = max_long_partial_prefills,
    long_prefill_token_threshold: int = long_prefill_token_threshold,
    max_num_seqs: Optional[int] = max_num_seqs,
    max_logprobs: int = max_logprobs,
    logprobs_mode: LogprobsMode = logprobs_mode,
    disable_log_stats: bool = False,
    revision: Optional[str] = revision,
    code_revision: Optional[str] = code_revision,
    rope_scaling: dict[str, Any] = get_field(
        ModelConfig, "rope_scaling"
    ),
    rope_theta: Optional[float] = rope_theta,
    hf_token: Optional[Union[bool, str]] = hf_token,
    hf_overrides: HfOverrides = get_field(
        ModelConfig, "hf_overrides"
    ),
    tokenizer_revision: Optional[str] = tokenizer_revision,
    quantization: Optional[
        QuantizationMethods
    ] = quantization,
    enforce_eager: bool = enforce_eager,
    max_seq_len_to_capture: int = max_seq_len_to_capture,
    disable_custom_all_reduce: bool = disable_custom_all_reduce,
    limit_mm_per_prompt: dict[str, int] = get_field(
        MultiModalConfig, "limit_per_prompt"
    ),
    interleave_mm_strings: bool = interleave_mm_strings,
    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
        MultiModalConfig, "media_io_kwargs"
    ),
    mm_processor_kwargs: Optional[
        Dict[str, Any]
    ] = mm_processor_kwargs,
    disable_mm_preprocessor_cache: bool = False,
    mm_processor_cache_gb: int = mm_processor_cache_gb,
    mm_encoder_tp_mode: MMEncoderTPMode = mm_encoder_tp_mode,
    skip_mm_profiling: bool = skip_mm_profiling,
    enable_lora: bool = False,
    enable_lora_bias: bool = bias_enabled,
    max_loras: int = max_loras,
    max_lora_rank: int = max_lora_rank,
    default_mm_loras: Optional[
        Dict[str, str]
    ] = default_mm_loras,
    fully_sharded_loras: bool = fully_sharded_loras,
    max_cpu_loras: Optional[int] = max_cpu_loras,
    lora_dtype: Optional[Union[str, dtype]] = lora_dtype,
    lora_extra_vocab_size: int = lora_extra_vocab_size,
    ray_workers_use_nsight: bool = ray_workers_use_nsight,
    num_gpu_blocks_override: Optional[
        int
    ] = num_gpu_blocks_override,
    num_lookahead_slots: int = num_lookahead_slots,
    model_loader_extra_config: dict = get_field(
        LoadConfig, "model_loader_extra_config"
    ),
    ignore_patterns: Optional[
        Union[str, List[str]]
    ] = ignore_patterns,
    preemption_mode: Optional[str] = preemption_mode,
    scheduler_delay_factor: float = delay_factor,
    enable_chunked_prefill: Optional[
        bool
    ] = enable_chunked_prefill,
    disable_chunked_mm_input: bool = disable_chunked_mm_input,
    disable_hybrid_kv_cache_manager: bool = disable_hybrid_kv_cache_manager,
    guided_decoding_backend: GuidedDecodingBackend = backend,
    guided_decoding_disable_fallback: bool = disable_fallback,
    guided_decoding_disable_any_whitespace: bool = disable_any_whitespace,
    guided_decoding_disable_additional_properties: bool = disable_additional_properties,
    logits_processor_pattern: Optional[
        str
    ] = logits_processor_pattern,
    speculative_config: Optional[Dict[str, Any]] = None,
    show_hidden_metrics_for_version: Optional[
        str
    ] = show_hidden_metrics_for_version,
    otlp_traces_endpoint: Optional[
        str
    ] = otlp_traces_endpoint,
    collect_detailed_traces: Optional[
        list[DetailedTraceModules]
    ] = collect_detailed_traces,
    disable_async_output_proc: bool = not use_async_output_proc,
    scheduling_policy: SchedulerPolicy = policy,
    scheduler_cls: Union[str, Type[object]] = scheduler_cls,
    override_neuron_config: dict[str, Any] = get_field(
        ModelConfig, "override_neuron_config"
    ),
    override_pooler_config: Optional[
        Union[dict, PoolerConfig]
    ] = override_pooler_config,
    compilation_config: CompilationConfig = get_field(
        VllmConfig, "compilation_config"
    ),
    worker_cls: str = worker_cls,
    worker_extension_cls: str = worker_extension_cls,
    kv_transfer_config: Optional[KVTransferConfig] = None,
    kv_events_config: Optional[KVEventsConfig] = None,
    generation_config: str = generation_config,
    enable_sleep_mode: bool = enable_sleep_mode,
    override_generation_config: dict[str, Any] = get_field(
        ModelConfig, "override_generation_config"
    ),
    model_impl: str = model_impl,
    override_attention_dtype: str = override_attention_dtype,
    calculate_kv_scales: bool = calculate_kv_scales,
    mamba_cache_dtype: MambaDType = mamba_cache_dtype,
    mamba_ssm_cache_dtype: MambaDType = mamba_ssm_cache_dtype,
    additional_config: dict[str, Any] = get_field(
        VllmConfig, "additional_config"
    ),
    reasoning_parser: str = reasoning_backend,
    use_tqdm_on_load: bool = use_tqdm_on_load,
    pt_load_map_location: str = pt_load_map_location,
    enable_multimodal_encoder_data_parallel: bool = False,
    logits_processors: Optional[
        list[Union[str, type[LogitsProcessor]]]
    ] = logits_processors,
    async_scheduling: bool = async_scheduling,
    kv_sharing_fast_prefill: bool = kv_sharing_fast_prefill,
    enable_log_requests: bool = False,
) -> None

add_cli_args staticmethod

add_cli_args(
    parser: FlexibleArgumentParser,
    async_args_only: bool = False,
) -> FlexibleArgumentParser
Source code in vllm/engine/arg_utils.py
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser,
                 async_args_only: bool = False) -> FlexibleArgumentParser:
    # Initialize plugins to update the parser. For example, a plugin may
    # add a new quantization method to the --quantization argument or
    # a new device to the --device argument.
    load_general_plugins()
    if not async_args_only:
        parser = EngineArgs.add_cli_args(parser)
    parser.add_argument('--enable-log-requests',
                        action=argparse.BooleanOptionalAction,
                        default=AsyncEngineArgs.enable_log_requests,
                        help='Enable logging requests.')
    parser.add_argument('--disable-log-requests',
                        action=argparse.BooleanOptionalAction,
                        default=not AsyncEngineArgs.enable_log_requests,
                        help='[DEPRECATED] Disable logging requests.',
                        deprecated=True)
    current_platform.pre_register_and_update(parser)
    return parser

AsyncLLMEngine

Bases: EngineClient

An asynchronous wrapper for LLMEngine.

This class is used to wrap the LLMEngine class to make it asynchronous. It uses asyncio to create a background loop that keeps processing incoming requests. The LLMEngine is kicked by the generate method when there are requests in the waiting queue. The generate method yields the outputs from the LLMEngine to the caller.

Parameters:

log_requests (bool): Whether to log the requests. Default: True.
start_engine_loop (bool): If True, the background task to run the engine will be automatically started in the generate call. Default: True.
*args: Arguments for LLMEngine. Default: ().
**kwargs: Arguments for LLMEngine. Default: {}.
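
Example: a minimal end-to-end sketch of driving the engine with asyncio (model name, prompt, and request id are illustrative):

```
import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))
    final_output = None
    async for output in engine.generate("What is LLM?",
                                        SamplingParams(max_tokens=32),
                                        request_id="req-0"):
        final_output = output
    print(final_output.outputs[0].text)

asyncio.run(main())
```
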
Source code in vllm/engine/async_llm_engine.py
class AsyncLLMEngine(EngineClient):
    """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].

    This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
    make it asynchronous. It uses asyncio to create a background loop that keeps
    processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
    by the generate method when there are requests in the waiting queue. The
    generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
    to the caller.

    Args:
        log_requests: Whether to log the requests.
        start_engine_loop: If True, the background task to run the engine
            will be automatically started in the generate call.
        *args: Arguments for [`LLMEngine`][vllm.LLMEngine].
        **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
    """

    _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine

    def __init__(self,
                 *args,
                 log_requests: bool = True,
                 start_engine_loop: bool = True,
                 **kwargs) -> None:
        if envs.VLLM_USE_V1:
            raise ValueError(
                "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
                "This should not happen. As a workaround, try using "
                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
                "VLLM_USE_V1=0 or 1 and report this issue on Github.")

        self.log_requests = log_requests
        self.engine = self._engine_class(*args, **kwargs)

        # This ensures quick processing of request outputs
        # so the append to asyncio queues is not delayed,
        # especially for multi-step.
        self.use_process_request_outputs_callback = (
            self.engine.model_config.use_async_output_proc)

        if self.use_process_request_outputs_callback:
            self.engine.process_request_outputs_callback = \
                weak_bind(self.process_request_outputs)

        self.background_loop: Optional[asyncio.Future] = None
        # We need to keep a reference to unshielded
        # task as well to prevent it from being garbage
        # collected
        self._background_loop_unshielded: Optional[asyncio.Task] = None
        self.start_engine_loop = start_engine_loop
        self._errored_with: Optional[BaseException] = None

        # Lazy initialized fields
        self._request_tracker: RequestTracker

    def __del__(self):
        if rt := getattr(self, "_request_tracker", None):
            # Wake up engine loop so that it will exit cleanly
            rt.new_requests_event.set()

    @classmethod
    def _get_executor_cls(cls,
                          engine_config: VllmConfig) -> Type[ExecutorBase]:
        return LLMEngine._get_executor_cls(engine_config)

    @classmethod
    @deprecate_kwargs(
        "disable_log_requests",
        additional_message=("This argument will have no effect. "
                            "Use `enable_log_requests` instead."),
    )
    def from_vllm_config(
            cls,
            vllm_config: VllmConfig,
            start_engine_loop: bool = True,
            usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
            stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
            enable_log_requests: bool = False,
            disable_log_stats: bool = False,
            disable_log_requests: bool = True,  # Deprecated, will be removed
    ) -> "AsyncLLMEngine":
        """Create an AsyncLLMEngine from the EngineArgs."""

        return cls(
            vllm_config=vllm_config,
            executor_class=cls._get_executor_cls(vllm_config),
            start_engine_loop=start_engine_loop,
            log_requests=enable_log_requests,
            log_stats=not disable_log_stats,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
        )

    @classmethod
    def from_engine_args(
        cls,
        engine_args: AsyncEngineArgs,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    ) -> "AsyncLLMEngine":
        """Creates an async LLM engine from the engine arguments."""

        vllm_config = engine_args.create_engine_config(usage_context)

        async_engine_cls = cls
        if envs.VLLM_USE_V1:
            from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine
            async_engine_cls = V1AsyncLLMEngine

        return async_engine_cls.from_vllm_config(
            vllm_config=vllm_config,
            start_engine_loop=start_engine_loop,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
            disable_log_stats=engine_args.disable_log_stats,
            enable_log_requests=engine_args.enable_log_requests,
        )

    @property
    def is_running(self) -> bool:
        return (self.background_loop is not None
                and self._background_loop_unshielded is not None
                and not self._background_loop_unshielded.done())

    @property
    def is_stopped(self) -> bool:
        return self.errored or (self.background_loop is not None and
                                self._background_loop_unshielded is not None
                                and self._background_loop_unshielded.done())

    @property
    def errored(self) -> bool:
        return self._errored_with is not None

    @property
    def dead_error(self) -> BaseException:
        return AsyncEngineDeadError(
            "Background loop is not running. If it was running, "
            "inspect the output to find the stacktrace of the "
            "error that caused the background loop to stop "
            "(AsyncEngineDeadError).")

    def set_errored(self, exc: Exception) -> None:
        self._errored_with = exc

    def _error_callback(self, exc: Exception) -> None:
        self.set_errored(exc)
        self._request_tracker.propagate_exception(exc)

    async def get_input_preprocessor(self) -> InputPreprocessor:
        return self.engine.input_preprocessor

    async def get_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        return await self.engine.get_tokenizer_async(lora_request)

    def start_background_loop(self) -> None:
        """Start the background loop."""
        if self.errored:
            raise AsyncEngineDeadError(
                "Background loop has errored already.") from self._errored_with
        if self.is_running:
            raise RuntimeError("Background loop is already running.")
        # Initialize the RequestTracker here so it uses the right event loop.
        self._request_tracker = RequestTracker()

        self._background_loop_unshielded = asyncio.get_event_loop(
        ).create_task(self.run_engine_loop(weakref.ref(self)))
        self._background_loop_unshielded.add_done_callback(
            partial(_log_task_completion, error_callback=self._error_callback))
        self.background_loop = asyncio.shield(self._background_loop_unshielded)

    def shutdown_background_loop(self) -> None:
        """
        Shut down the background loop.

        This method needs to be called during cleanup to remove
        references to `self` and properly GC the resources held
        by the async LLM engine (e.g., the executors as well as
        their resources).
        """
        if self._background_loop_unshielded is not None:
            self._background_loop_unshielded.cancel()
            self._background_loop_unshielded = None
        self.background_loop = None

    async def engine_step(self, virtual_engine: int) -> bool:
        """Kick the engine to process the waiting requests.

        Returns True if there are in-progress requests."""

        new_requests, aborted_requests = (
            self._request_tracker.get_new_and_aborted_requests())

        for new_request in new_requests:
            # Add the request into the vLLM engine's waiting queue.
            try:
                await self.engine.add_request_async(**new_request)
            except ValueError as e:
                # TODO: use a vLLM specific error for failed validation
                self._request_tracker.process_exception(
                    new_request["request_id"],
                    e,
                    verbose=self.log_requests,
                )

        if aborted_requests:
            await self._engine_abort(aborted_requests)

        request_outputs = await self.engine.step_async(virtual_engine)

        # Put the outputs into the corresponding streams.
        # If used as a callback, then already invoked inside
        # LLMEngine's _process_model_outputs
        if not self.use_process_request_outputs_callback:
            all_finished = self.process_request_outputs(request_outputs)
        else:
            # For callback case, we only need to detect when all
            # requests are finished
            all_finished = all(request_output.finished
                               for request_output in request_outputs)

        return not all_finished

    def process_request_outputs(self, request_outputs) -> bool:
        # Put the outputs into the corresponding streams.
        all_finished = True
        for request_output in request_outputs:
            self._request_tracker.process_request_output(
                request_output, verbose=self.log_requests)
            all_finished = all_finished and request_output.finished

        return all_finished

    async def _engine_abort(self, request_ids: Iterable[str]):
        self.engine.abort_request(request_ids)

    @staticmethod
    async def run_engine_loop(engine_ref: ReferenceType):
        """We use a weakref to the engine so that the running loop
        doesn't prevent the engine being garbage collected."""
        engine: Optional[AsyncLLMEngine] = engine_ref()
        if not engine:
            return

        pipeline_parallel_size = \
                engine.engine.parallel_config.pipeline_parallel_size
        has_requests_in_progress = [False] * pipeline_parallel_size
        while True:
            if not any(has_requests_in_progress):
                logger.debug("Waiting for new requests...")
                # Stop the execute model loop in parallel workers until there
                # are more requests to process. This avoids waiting
                # indefinitely in torch.distributed ops which may otherwise
                # timeout, and unblocks the RPC thread in the workers so that
                # they can process any other queued control plane messages,
                # such as add/remove lora adapters.
                await engine.engine.stop_remote_worker_execution_loop_async()
                request_tracker = engine._request_tracker
                # Allow engine to be garbage collected while
                # waiting for new requests
                del engine
                await asyncio.sleep(0)
                if engine_ref() is None:
                    return
                await request_tracker.wait_for_new_requests()
                engine = engine_ref()
                if not engine:
                    return
                logger.debug("Got new requests!")
                requests_in_progress = [
                    asyncio.create_task(engine.engine_step(ve))
                    for ve in range(pipeline_parallel_size)
                ]
                has_requests_in_progress = [True] * pipeline_parallel_size

            # Abort if iteration takes too long due to unrecoverable errors
            # (eg. NCCL timeouts).
            try:
                async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
                    done, _ = await asyncio.wait(
                        requests_in_progress,
                        return_when=asyncio.FIRST_COMPLETED)
                    for _ in range(pipeline_parallel_size):
                        await asyncio.sleep(0)
                for task in done:
                    result = task.result()
                    virtual_engine = requests_in_progress.index(task)
                    has_unfinished_requests = (
                        engine.engine.
                        has_unfinished_requests_for_virtual_engine(
                            virtual_engine))
                    if result or has_unfinished_requests:
                        requests_in_progress[virtual_engine] = (
                            asyncio.create_task(
                                engine.engine_step(virtual_engine)))
                        has_requests_in_progress[virtual_engine] = True
                    else:
                        has_requests_in_progress[virtual_engine] = False
            except asyncio.TimeoutError as exc:
                logger.error(
                    "Engine iteration timed out. This should never happen!")
                engine.set_errored(exc)
                raise
            await asyncio.sleep(0)

    async def add_request(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        data_parallel_rank: Optional[int] = None,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
        if not self.is_running:
            if self.start_engine_loop:
                self.start_background_loop()
            else:
                raise AsyncEngineDeadError(
                    "Background loop is not running. If it was running, "
                    "inspect the output to find the stacktrace of the "
                    "error that caused the background loop to stop "
                    "(AsyncEngineDeadError).")

        if (priority != 0
                and not self.engine.scheduler_config.policy == "priority"):
            raise ValueError(f"Got priority {priority} but "
                             "Priority scheduling is not enabled.")

        stream = self._request_tracker.add_request(
            request_id,
            verbose=self.log_requests,
            prompt=prompt,
            params=params,
            arrival_time=arrival_time or time.time(),
            lora_request=lora_request,
            trace_headers=trace_headers,
            priority=priority,
            data_parallel_rank=data_parallel_rank,
            tokenization_kwargs=tokenization_kwargs,
        )

        return stream.generator()

    async def generate(
        self,
        prompt: PromptType,
        sampling_params: SamplingParams,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        data_parallel_rank: Optional[int] = None,
    ) -> AsyncGenerator[RequestOutput, None]:
        """Generate outputs for a request.

        Generate outputs for a request. This method is a coroutine. It adds the
        request into the waiting queue of the LLMEngine and streams the outputs
        from the LLMEngine to the caller.

        Args:
            prompt: The prompt to the LLM. See
                [`PromptType`][vllm.inputs.PromptType] for more details about
                the format of each input.
            sampling_params: The sampling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.
            trace_headers: OpenTelemetry trace headers.
            priority: The priority of the request.
                Only applicable with priority scheduling.
            data_parallel_rank: The (global) data parallel rank that must
                handle this request. Only applicable if DP is enabled.
        Yields:
            The output `RequestOutput` objects from the LLMEngine
            for the request.

        Details:
            - If the engine is not running, start the background loop,
              which iteratively invokes
              [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
              to process the waiting requests.
            - Add the request to the engine's `RequestTracker`.
              On the next background loop, this request will be sent to
              the underlying engine.
              Also, a corresponding `AsyncStream` will be created.
            - Wait for the request outputs from `AsyncStream` and yield them.

        Example:
            >>> # Please refer to entrypoints/api_server.py for
            >>> # the complete example.
            >>>
            >>> # initialize the engine and the example input
            >>> # note that engine_args here is AsyncEngineArgs instance
            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
            >>> example_input = {
            >>>     "prompt": "What is LLM?",
            >>>     "stream": False, # assume the non-streaming case
            >>>     "temperature": 0.0,
            >>>     "request_id": 0,
            >>> }
            >>>
            >>> # start the generation
            >>> results_generator = engine.generate(
            >>>    example_input["prompt"],
            >>>    SamplingParams(temperature=example_input["temperature"]),
            >>>    example_input["request_id"])
            >>>
            >>> # get the results
            >>> final_output = None
            >>> async for request_output in results_generator:
            >>>     if await request.is_disconnected():
            >>>         # Abort the request if the client disconnects.
            >>>         await engine.abort(request_id)
            >>>         # Return or raise an error
            >>>         ...
            >>>     final_output = request_output
            >>>
            >>> # Process and return the final output
            >>> ...
        """
        try:
            async for output in await self.add_request(
                    request_id,
                    prompt,
                    sampling_params,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=priority,
                    data_parallel_rank=data_parallel_rank,
            ):
                yield LLMEngine.validate_output(output, RequestOutput)
        except asyncio.CancelledError:
            await self.abort(request_id)
            raise

    async def encode(
        self,
        prompt: PromptType,
        pooling_params: PoolingParams,
        request_id: str,
        lora_request: Optional[LoRARequest] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Generate outputs for a request from a pooling model.

        Generate outputs for a request. This method is a coroutine. It adds the
        request into the waiting queue of the LLMEngine and streams the outputs
        from the LLMEngine to the caller.

        Args:
            prompt: The prompt to the LLM. See
                [`PromptType`][vllm.inputs.PromptType] for more details about
                the format of each input.
            pooling_params: The pooling parameters of the request.
            request_id: The unique id of the request.
            lora_request: LoRA request to use for generation, if any.
            trace_headers: OpenTelemetry trace headers.
            priority: The priority of the request.
                Only applicable with priority scheduling.

        Yields:
            The output `PoolingRequestOutput` objects from the LLMEngine
            for the request.

        Details:
            - If the engine is not running, start the background loop,
                which iteratively invokes
                [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
                to process the waiting requests.
            - Add the request to the engine's `RequestTracker`.
                On the next background loop, this request will be sent to
                the underlying engine.
                Also, a corresponding `AsyncStream` will be created.
            - Wait for the request outputs from `AsyncStream` and yield them.

        Example:
        ```
        # Please refer to entrypoints/api_server.py for
        # the complete example.

        # initialize the engine and the example input
        # note that engine_args here is AsyncEngineArgs instance
        engine = AsyncLLMEngine.from_engine_args(engine_args)
        example_input = {
            "input": "What is LLM?",
            "request_id": 0,
        }

        # start the generation
        results_generator = engine.encode(
            example_input["input"],
            PoolingParams(),
            example_input["request_id"])

        # get the results
        final_output = None
        async for request_output in results_generator:
            if await request.is_disconnected():
                # Abort the request if the client disconnects.
                await engine.abort(request_id)
                # Return or raise an error
                ...
            final_output = request_output

        # Process and return the final output
        ...
        ```
        """
        try:
            async for output in await self.add_request(
                    request_id,
                    prompt,
                    pooling_params,
                    lora_request=lora_request,
                    trace_headers=trace_headers,
                    priority=priority,
                    tokenization_kwargs=tokenization_kwargs,
            ):
                yield LLMEngine.validate_output(output, PoolingRequestOutput)
        except asyncio.CancelledError:
            await self.abort(request_id)
            raise

    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
        """Abort a request.

        Abort a submitted request. If the request is finished or not found,
        this method will be a no-op.

        Args:
            request_id: The unique id of the request.
        """
        if not isinstance(request_id, str):
            raise RuntimeError("Only single-request abort supported in"
                               " deprecated V0")
        if not self.is_running:
            raise AsyncEngineDeadError(
                "Background loop is not running. If it was running, "
                "inspect the output to find the stacktrace of the "
                "error that caused the background loop to stop "
                "(AsyncEngineDeadError).")

        return self._abort(request_id)

    def _abort(self, request_id: str) -> None:
        """Abort a request.

        Abort a submitted request. If the request is finished or not found,
        this method will be a no-op.

        Args:
            request_id: The unique id of the request.
        """
        self._request_tracker.abort_request(request_id,
                                            exception=asyncio.CancelledError,
                                            verbose=self.log_requests)

    async def get_vllm_config(self) -> VllmConfig:
        """Get the vllm configuration of the vLLM engine."""
        return self.engine.get_vllm_config()

    async def get_model_config(self) -> ModelConfig:
        """Get the model configuration of the vLLM engine."""
        return self.engine.get_model_config()

    async def get_parallel_config(self) -> ParallelConfig:
        """Get the parallel configuration of the vLLM engine."""
        return self.engine.get_parallel_config()

    async def get_decoding_config(self) -> DecodingConfig:
        """Get the decoding configuration of the vLLM engine."""
        return self.engine.get_decoding_config()

    async def get_scheduler_config(self) -> SchedulerConfig:
        """Get the scheduling configuration of the vLLM engine."""
        return self.engine.get_scheduler_config()

    async def get_lora_config(self) -> LoRAConfig:
        """Get the lora configuration of the vLLM engine."""
        return self.engine.get_lora_config()

    async def do_log_stats(
            self,
            scheduler_outputs: Optional[SchedulerOutputs] = None,
            model_output: Optional[List[SamplerOutput]] = None) -> None:
        self.engine.do_log_stats()

    async def check_health(self) -> None:
        """Raises an error if engine is unhealthy."""
        t = time.perf_counter()
        logger.debug("Starting health check...")
        if self.is_stopped:
            raise AsyncEngineDeadError("Background loop is stopped.")

        await self.engine.check_health_async()
        logger.debug("Health check took %fs", time.perf_counter() - t)

    async def is_tracing_enabled(self) -> bool:
        return self.engine.is_tracing_enabled()

    def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
        self.engine.add_logger(logger_name=logger_name, logger=logger)

    def remove_logger(self, logger_name: str) -> None:
        self.engine.remove_logger(logger_name=logger_name)

    async def start_profile(self) -> None:
        self.engine.start_profile()

    async def stop_profile(self) -> None:
        self.engine.stop_profile()

    async def reset_mm_cache(self) -> None:
        self.engine.reset_mm_cache()

    async def reset_prefix_cache(self,
                                 device: Optional[Device] = None) -> None:
        self.engine.reset_prefix_cache(device)

    async def sleep(self, level: int = 1) -> None:
        await self.reset_prefix_cache()
        self.engine.sleep(level)

    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
        self.engine.wake_up(tags)

    async def is_sleeping(self) -> bool:
        return self.engine.is_sleeping()

    async def add_lora(self, lora_request: LoRARequest) -> None:
        self.engine.add_lora(lora_request)

    async def collective_rpc(self,
                             method: str,
                             timeout: Optional[float] = None,
                             args: tuple = (),
                             kwargs: Optional[dict] = None):
        """
        Perform a collective RPC call to the given path.
        """
        return await self.engine.collective_rpc_async(method, timeout, args,
                                                      kwargs)

_background_loop_unshielded instance-attribute

_background_loop_unshielded: Optional[Task] = None

_engine_class class-attribute instance-attribute

_errored_with instance-attribute

_errored_with: Optional[BaseException] = None

_request_tracker instance-attribute

_request_tracker: RequestTracker

background_loop instance-attribute

background_loop: Optional[Future] = None

dead_error property

dead_error: BaseException

engine instance-attribute

engine = _engine_class(*args, **kwargs)

errored property

errored: bool

is_running property

is_running: bool

is_stopped property

is_stopped: bool

log_requests instance-attribute

log_requests = log_requests

start_engine_loop instance-attribute

start_engine_loop = start_engine_loop

use_process_request_outputs_callback instance-attribute

use_process_request_outputs_callback = use_async_output_proc

__del__

__del__()
Source code in vllm/engine/async_llm_engine.py
def __del__(self):
    if rt := getattr(self, "_request_tracker", None):
        # Wake up engine loop so that it will exit cleanly
        rt.new_requests_event.set()

__init__

__init__(
    *args,
    log_requests: bool = True,
    start_engine_loop: bool = True,
    **kwargs,
) -> None
Source code in vllm/engine/async_llm_engine.py
def __init__(self,
             *args,
             log_requests: bool = True,
             start_engine_loop: bool = True,
             **kwargs) -> None:
    if envs.VLLM_USE_V1:
        raise ValueError(
            "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
            "This should not happen. As a workaround, try using "
            "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
            "VLLM_USE_V1=0 or 1 and report this issue on Github.")

    self.log_requests = log_requests
    self.engine = self._engine_class(*args, **kwargs)

    # This ensures quick processing of request outputs
    # so the append to asyncio queues is not delayed,
    # especially for multi-step.
    self.use_process_request_outputs_callback = (
        self.engine.model_config.use_async_output_proc)

    if self.use_process_request_outputs_callback:
        self.engine.process_request_outputs_callback = \
            weak_bind(self.process_request_outputs)

    self.background_loop: Optional[asyncio.Future] = None
    # We need to keep a reference to unshielded
    # task as well to prevent it from being garbage
    # collected
    self._background_loop_unshielded: Optional[asyncio.Task] = None
    self.start_engine_loop = start_engine_loop
    self._errored_with: Optional[BaseException] = None

    # Lazy initialized fields
    self._request_tracker: RequestTracker

_abort

_abort(request_id: str) -> None

Abort a request.

Abort a submitted request. If the request is finished or not found, this method will be a no-op.

Parameters:

request_id (str): The unique id of the request. Required.
Source code in vllm/engine/async_llm_engine.py
def _abort(self, request_id: str) -> None:
    """Abort a request.

    Abort a submitted request. If the request is finished or not found,
    this method will be a no-op.

    Args:
        request_id: The unique id of the request.
    """
    self._request_tracker.abort_request(request_id,
                                        exception=asyncio.CancelledError,
                                        verbose=self.log_requests)

_engine_abort async

_engine_abort(request_ids: Iterable[str])
Source code in vllm/engine/async_llm_engine.py
async def _engine_abort(self, request_ids: Iterable[str]):
    self.engine.abort_request(request_ids)

_error_callback

_error_callback(exc: Exception) -> None
Source code in vllm/engine/async_llm_engine.py
def _error_callback(self, exc: Exception) -> None:
    self.set_errored(exc)
    self._request_tracker.propagate_exception(exc)

_get_executor_cls classmethod

_get_executor_cls(
    engine_config: VllmConfig,
) -> Type[ExecutorBase]
Source code in vllm/engine/async_llm_engine.py
@classmethod
def _get_executor_cls(cls,
                      engine_config: VllmConfig) -> Type[ExecutorBase]:
    return LLMEngine._get_executor_cls(engine_config)

abort async

abort(request_id: Union[str, Iterable[str]]) -> None

Abort a request.

Abort a submitted request. If the request is finished or not found, this method will be a no-op.

Parameters:

request_id (Union[str, Iterable[str]]): The unique id of the request. Required.
Source code in vllm/engine/async_llm_engine.py
async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
    """Abort a request.

    Abort a submitted request. If the request is finished or not found,
    this method will be a no-op.

    Args:
        request_id: The unique id of the request.
    """
    if not isinstance(request_id, str):
        raise RuntimeError("Only single-request abort supported in"
                           " deprecated V0")
    if not self.is_running:
        raise AsyncEngineDeadError(
            "Background loop is not running. If it was running, "
            "inspect the output to find the stacktrace of the "
            "error that caused the background loop to stop "
            "(AsyncEngineDeadError).")

    return self._abort(request_id)
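
Example: a hedged sketch of the common pattern of aborting a streamed request when the client disconnects (`request` is assumed to expose an `is_disconnected()` coroutine, e.g. a Starlette request; the other names are illustrative):

```
async def stream_or_abort(engine, request, request_id, results_generator):
    final_output = None
    async for request_output in results_generator:
        if await request.is_disconnected():
            # Aborting a finished or unknown request id is a no-op.
            await engine.abort(request_id)
            return None
        final_output = request_output
    return final_output
```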

add_logger

add_logger(
    logger_name: str, logger: StatLoggerBase
) -> None
Source code in vllm/engine/async_llm_engine.py
def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
    self.engine.add_logger(logger_name=logger_name, logger=logger)

add_lora async

add_lora(lora_request: LoRARequest) -> None
Source code in vllm/engine/async_llm_engine.py
async def add_lora(self, lora_request: LoRARequest) -> None:
    self.engine.add_lora(lora_request)

add_request async

add_request(
    request_id: str,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: Optional[float] = None,
    lora_request: Optional[LoRARequest] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    data_parallel_rank: Optional[int] = None,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> AsyncGenerator[
    Union[RequestOutput, PoolingRequestOutput], None
]
Source code in vllm/engine/async_llm_engine.py
async def add_request(
    self,
    request_id: str,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: Optional[float] = None,
    lora_request: Optional[LoRARequest] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    data_parallel_rank: Optional[int] = None,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
    if not self.is_running:
        if self.start_engine_loop:
            self.start_background_loop()
        else:
            raise AsyncEngineDeadError(
                "Background loop is not running. If it was running, "
                "inspect the output to find the stacktrace of the "
                "error that caused the background loop to stop "
                "(AsyncEngineDeadError).")

    if (priority != 0
            and not self.engine.scheduler_config.policy == "priority"):
        raise ValueError(f"Got priority {priority} but "
                         "Priority scheduling is not enabled.")

    stream = self._request_tracker.add_request(
        request_id,
        verbose=self.log_requests,
        prompt=prompt,
        params=params,
        arrival_time=arrival_time or time.time(),
        lora_request=lora_request,
        trace_headers=trace_headers,
        priority=priority,
        data_parallel_rank=data_parallel_rank,
        tokenization_kwargs=tokenization_kwargs,
    )

    return stream.generator()
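
Example: add_request is the low-level entry point that generate and encode wrap; it returns the output stream directly and starts the background loop on first use. A hedged sketch (engine, prompt, and params are assumed to exist; the request id is illustrative):

```
async def submit_and_collect(engine, prompt, params, request_id="req-42"):
    # The returned stream is an async generator of RequestOutput /
    # PoolingRequestOutput objects for this request.
    stream = await engine.add_request(request_id, prompt, params)
    return [request_output async for request_output in stream]
```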

check_health async

check_health() -> None

Raises an error if engine is unhealthy.

Source code in vllm/engine/async_llm_engine.py
async def check_health(self) -> None:
    """Raises an error if engine is unhealthy."""
    t = time.perf_counter()
    logger.debug("Starting health check...")
    if self.is_stopped:
        raise AsyncEngineDeadError("Background loop is stopped.")

    await self.engine.check_health_async()
    logger.debug("Health check took %fs", time.perf_counter() - t)
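
Example: a hedged sketch of a background watchdog built on check_health (the polling interval is illustrative):

```
import asyncio

async def watchdog(engine) -> None:
    # check_health() raises AsyncEngineDeadError once the background loop
    # has stopped, which terminates this watchdog with the same error.
    while True:
        await engine.check_health()
        await asyncio.sleep(10)
```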

collective_rpc async

collective_rpc(
    method: str,
    timeout: Optional[float] = None,
    args: tuple = (),
    kwargs: Optional[dict] = None,
)

Perform a collective RPC call to the given path.

Source code in vllm/engine/async_llm_engine.py
async def collective_rpc(self,
                         method: str,
                         timeout: Optional[float] = None,
                         args: tuple = (),
                         kwargs: Optional[dict] = None):
    """
    Perform a collective RPC call to the given path.
    """
    return await self.engine.collective_rpc_async(method, timeout, args,
                                                  kwargs)
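
Example: a hedged sketch of broadcasting a call to the engine's workers; the worker-side method name is hypothetical and must exist on the configured worker class:

```
async def query_workers(engine):
    # Invoke a hypothetical "get_device_name" method on the engine's workers.
    return await engine.collective_rpc("get_device_name", timeout=30.0)
```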

do_log_stats async

do_log_stats(
    scheduler_outputs: Optional[SchedulerOutputs] = None,
    model_output: Optional[List[SamplerOutput]] = None,
) -> None
Source code in vllm/engine/async_llm_engine.py
async def do_log_stats(
        self,
        scheduler_outputs: Optional[SchedulerOutputs] = None,
        model_output: Optional[List[SamplerOutput]] = None) -> None:
    self.engine.do_log_stats()

encode async

encode(
    prompt: PromptType,
    pooling_params: PoolingParams,
    request_id: str,
    lora_request: Optional[LoRARequest] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> AsyncGenerator[PoolingRequestOutput, None]

Generate outputs for a request from a pooling model.

Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs from the LLMEngine to the caller.

Parameters:

prompt (PromptType): The prompt to the LLM. See PromptType for more details about the format of each input. Required.
pooling_params (PoolingParams): The pooling parameters of the request. Required.
request_id (str): The unique id of the request. Required.
lora_request (Optional[LoRARequest]): LoRA request to use for generation, if any. Default: None.
trace_headers (Optional[Mapping[str, str]]): OpenTelemetry trace headers. Default: None.
priority (int): The priority of the request. Only applicable with priority scheduling. Default: 0.

Yields:

AsyncGenerator[PoolingRequestOutput, None]: The output PoolingRequestOutput objects from the LLMEngine for the request.

Details
  • If the engine is not running, start the background loop, which iteratively invokes vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step to process the waiting requests.
  • Add the request to the engine's RequestTracker. On the next background loop, this request will be sent to the underlying engine. Also, a corresponding AsyncStream will be created.
  • Wait for the request outputs from AsyncStream and yield them.

Example:

# Please refer to entrypoints/api_server.py for
# the complete example.

# initialize the engine and the example input
# note that engine_args here is AsyncEngineArgs instance
engine = AsyncLLMEngine.from_engine_args(engine_args)
example_input = {
    "input": "What is LLM?",
    "request_id": 0,
}

# start the generation
results_generator = engine.encode(
    example_input["input"],
    PoolingParams(),
    example_input["request_id"])

# get the results
final_output = None
async for request_output in results_generator:
    if await request.is_disconnected():
        # Abort the request if the client disconnects.
        await engine.abort(request_id)
        # Return or raise an error
        ...
    final_output = request_output

# Process and return the final output
...

Source code in vllm/engine/async_llm_engine.py
async def encode(
    self,
    prompt: PromptType,
    pooling_params: PoolingParams,
    request_id: str,
    lora_request: Optional[LoRARequest] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> AsyncGenerator[PoolingRequestOutput, None]:
    """Generate outputs for a request from a pooling model.

    Generate outputs for a request. This method is a coroutine. It adds the
    request into the waiting queue of the LLMEngine and streams the outputs
    from the LLMEngine to the caller.

    Args:
        prompt: The prompt to the LLM. See
            [`PromptType`][vllm.inputs.PromptType] for more details about
            the format of each input.
        pooling_params: The pooling parameters of the request.
        request_id: The unique id of the request.
        lora_request: LoRA request to use for generation, if any.
        trace_headers: OpenTelemetry trace headers.
        priority: The priority of the request.
            Only applicable with priority scheduling.

    Yields:
        The output `PoolingRequestOutput` objects from the LLMEngine
        for the request.

    Details:
        - If the engine is not running, start the background loop,
            which iteratively invokes
            [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
            to process the waiting requests.
        - Add the request to the engine's `RequestTracker`.
            On the next background loop, this request will be sent to
            the underlying engine.
            Also, a corresponding `AsyncStream` will be created.
        - Wait for the request outputs from `AsyncStream` and yield them.

    Example:
    ```
    # Please refer to entrypoints/api_server.py for
    # the complete example.

    # initialize the engine and the example input
    # note that engine_args here is AsyncEngineArgs instance
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    example_input = {
        "input": "What is LLM?",
        "request_id": 0,
    }

    # start the generation
    results_generator = engine.encode(
    example_input["input"],
    PoolingParams(),
    example_input["request_id"])

    # get the results
    final_output = None
    async for request_output in results_generator:
        if await request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            # Return or raise an error
            ...
        final_output = request_output

    # Process and return the final output
    ...
    ```
    """
    try:
        async for output in await self.add_request(
                request_id,
                prompt,
                pooling_params,
                lora_request=lora_request,
                trace_headers=trace_headers,
                priority=priority,
                tokenization_kwargs=tokenization_kwargs,
        ):
            yield LLMEngine.validate_output(output, PoolingRequestOutput)
    except asyncio.CancelledError:
        await self.abort(request_id)
        raise

engine_step async

engine_step(virtual_engine: int) -> bool

Kick the engine to process the waiting requests.

Returns True if there are in-progress requests.

Source code in vllm/engine/async_llm_engine.py
async def engine_step(self, virtual_engine: int) -> bool:
    """Kick the engine to process the waiting requests.

    Returns True if there are in-progress requests."""

    new_requests, aborted_requests = (
        self._request_tracker.get_new_and_aborted_requests())

    for new_request in new_requests:
        # Add the request into the vLLM engine's waiting queue.
        try:
            await self.engine.add_request_async(**new_request)
        except ValueError as e:
            # TODO: use a vLLM specific error for failed validation
            self._request_tracker.process_exception(
                new_request["request_id"],
                e,
                verbose=self.log_requests,
            )

    if aborted_requests:
        await self._engine_abort(aborted_requests)

    request_outputs = await self.engine.step_async(virtual_engine)

    # Put the outputs into the corresponding streams.
    # If used as a callback, then already invoked inside
    # LLMEngine's _process_model_outputs
    if not self.use_process_request_outputs_callback:
        all_finished = self.process_request_outputs(request_outputs)
    else:
        # For callback case, we only need to detect when all
        # requests are finished
        all_finished = all(request_output.finished
                           for request_output in request_outputs)

    return not all_finished

from_engine_args classmethod

from_engine_args(
    engine_args: AsyncEngineArgs,
    start_engine_loop: bool = True,
    usage_context: UsageContext = ENGINE_CONTEXT,
    stat_loggers: Optional[
        Dict[str, StatLoggerBase]
    ] = None,
) -> AsyncLLMEngine

Creates an async LLM engine from the engine arguments.

Source code in vllm/engine/async_llm_engine.py
@classmethod
def from_engine_args(
    cls,
    engine_args: AsyncEngineArgs,
    start_engine_loop: bool = True,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
) -> "AsyncLLMEngine":
    """Creates an async LLM engine from the engine arguments."""

    vllm_config = engine_args.create_engine_config(usage_context)

    async_engine_cls = cls
    if envs.VLLM_USE_V1:
        from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine
        async_engine_cls = V1AsyncLLMEngine

    return async_engine_cls.from_vllm_config(
        vllm_config=vllm_config,
        start_engine_loop=start_engine_loop,
        usage_context=usage_context,
        stat_loggers=stat_loggers,
        disable_log_stats=engine_args.disable_log_stats,
        enable_log_requests=engine_args.enable_log_requests,
    )
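
For orientation, a short hedged example of the usual construction path; the model name is only an illustration:

```python
from vllm import AsyncEngineArgs, AsyncLLMEngine

# Example model name only; substitute the model you actually serve.
engine_args = AsyncEngineArgs(model="facebook/opt-125m")

# Builds the engine config and returns the async engine (the V1 AsyncLLM
# class is substituted automatically when VLLM_USE_V1 is enabled).
engine = AsyncLLMEngine.from_engine_args(engine_args)
```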

from_vllm_config classmethod

from_vllm_config(
    vllm_config: VllmConfig,
    start_engine_loop: bool = True,
    usage_context: UsageContext = ENGINE_CONTEXT,
    stat_loggers: Optional[
        dict[str, StatLoggerBase]
    ] = None,
    enable_log_requests: bool = False,
    disable_log_stats: bool = False,
    disable_log_requests: bool = True,
) -> AsyncLLMEngine

Create an AsyncLLMEngine from the EngineArgs.

Source code in vllm/engine/async_llm_engine.py
@classmethod
@deprecate_kwargs(
    "disable_log_requests",
    additional_message=("This argument will have no effect. "
                        "Use `enable_log_requests` instead."),
)
def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
        start_engine_loop: bool = True,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
        enable_log_requests: bool = False,
        disable_log_stats: bool = False,
        disable_log_requests: bool = True,  # Deprecated, will be removed
) -> "AsyncLLMEngine":
    """Create an AsyncLLMEngine from the EngineArgs."""

    return cls(
        vllm_config=vllm_config,
        executor_class=cls._get_executor_cls(vllm_config),
        start_engine_loop=start_engine_loop,
        log_requests=enable_log_requests,
        log_stats=not disable_log_stats,
        usage_context=usage_context,
        stat_loggers=stat_loggers,
    )

generate async

generate(
    prompt: PromptType,
    sampling_params: SamplingParams,
    request_id: str,
    lora_request: Optional[LoRARequest] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]

Generate outputs for a request.

Generate outputs for a request. This method is a coroutine. It adds the request into the waiting queue of the LLMEngine and streams the outputs from the LLMEngine to the caller.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| prompt | PromptType | The prompt to the LLM. See PromptType for more details about the format of each input. | required |
| sampling_params | SamplingParams | The sampling parameters of the request. | required |
| request_id | str | The unique id of the request. | required |
| lora_request | Optional[LoRARequest] | LoRA request to use for generation, if any. | None |
| trace_headers | Optional[Mapping[str, str]] | OpenTelemetry trace headers. | None |
| priority | int | The priority of the request. Only applicable with priority scheduling. | 0 |
| data_parallel_rank | Optional[int] | The (global) data parallel rank that must handle this request. Only applicable if DP is enabled. | None |

Yields: The output RequestOutput objects from the LLMEngine for the request.

Details
  • If the engine is not running, start the background loop, which iteratively invokes engine_step to process the waiting requests.
  • Add the request to the engine's RequestTracker. On the next background loop, this request will be sent to the underlying engine. Also, a corresponding AsyncStream will be created.
  • Wait for the request outputs from AsyncStream and yield them.
Example:

# Please refer to entrypoints/api_server.py for
# the complete example.

# initialize the engine and the example input
# note that engine_args here is AsyncEngineArgs instance
engine = AsyncLLMEngine.from_engine_args(engine_args)
example_input = {
    "prompt": "What is LLM?",
    "stream": False,  # assume the non-streaming case
    "temperature": 0.0,
    "request_id": 0,
}

# start the generation
results_generator = engine.generate(
    example_input["prompt"],
    SamplingParams(temperature=example_input["temperature"]),
    example_input["request_id"])

# get the results
final_output = None
async for request_output in results_generator:
    if await request.is_disconnected():
        # Abort the request if the client disconnects.
        await engine.abort(request_id)
        # Return or raise an error
        ...
    final_output = request_output

# Process and return the final output
...

Source code in vllm/engine/async_llm_engine.py
async def generate(
    self,
    prompt: PromptType,
    sampling_params: SamplingParams,
    request_id: str,
    lora_request: Optional[LoRARequest] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
    data_parallel_rank: Optional[int] = None,
) -> AsyncGenerator[RequestOutput, None]:
    """Generate outputs for a request.

    Generate outputs for a request. This method is a coroutine. It adds the
    request into the waiting queue of the LLMEngine and streams the outputs
    from the LLMEngine to the caller.

    Args:
        prompt: The prompt to the LLM. See
            [`PromptType`][vllm.inputs.PromptType] for more details about
            the format of each input.
        sampling_params: The sampling parameters of the request.
        request_id: The unique id of the request.
        lora_request: LoRA request to use for generation, if any.
        trace_headers: OpenTelemetry trace headers.
        priority: The priority of the request.
            Only applicable with priority scheduling.
        data_parallel_rank: The (global) data parallel rank that must
            handle this request. Only applicable if DP is enabled.
    Yields:
        The output `RequestOutput` objects from the LLMEngine
        for the request.

    Details:
        - If the engine is not running, start the background loop,
          which iteratively invokes
          [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
          to process the waiting requests.
        - Add the request to the engine's `RequestTracker`.
          On the next background loop, this request will be sent to
          the underlying engine.
          Also, a corresponding `AsyncStream` will be created.
        - Wait for the request outputs from `AsyncStream` and yield them.

    Example:
        >>> # Please refer to entrypoints/api_server.py for
        >>> # the complete example.
        >>>
        >>> # initialize the engine and the example input
        >>> # note that engine_args here is AsyncEngineArgs instance
        >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
        >>> example_input = {
        >>>     "prompt": "What is LLM?",
        >>>     "stream": False, # assume the non-streaming case
        >>>     "temperature": 0.0,
        >>>     "request_id": 0,
        >>> }
        >>>
        >>> # start the generation
        >>> results_generator = engine.generate(
        >>>    example_input["prompt"],
        >>>    SamplingParams(temperature=example_input["temperature"]),
        >>>    example_input["request_id"])
        >>>
        >>> # get the results
        >>> final_output = None
        >>> async for request_output in results_generator:
        >>>     if await request.is_disconnected():
        >>>         # Abort the request if the client disconnects.
        >>>         await engine.abort(request_id)
        >>>         # Return or raise an error
        >>>         ...
        >>>     final_output = request_output
        >>>
        >>> # Process and return the final output
        >>> ...
    """
    try:
        async for output in await self.add_request(
                request_id,
                prompt,
                sampling_params,
                lora_request=lora_request,
                trace_headers=trace_headers,
                priority=priority,
                data_parallel_rank=data_parallel_rank,
        ):
            yield LLMEngine.validate_output(output, RequestOutput)
    except asyncio.CancelledError:
        await self.abort(request_id)
        raise

get_decoding_config async

get_decoding_config() -> DecodingConfig

Get the decoding configuration of the vLLM engine.

Source code in vllm/engine/async_llm_engine.py
async def get_decoding_config(self) -> DecodingConfig:
    """Get the decoding configuration of the vLLM engine."""
    return self.engine.get_decoding_config()

get_input_preprocessor async

get_input_preprocessor() -> InputPreprocessor
Source code in vllm/engine/async_llm_engine.py
async def get_input_preprocessor(self) -> InputPreprocessor:
    return self.engine.input_preprocessor

get_lora_config async

get_lora_config() -> LoRAConfig

Get the lora configuration of the vLLM engine.

Source code in vllm/engine/async_llm_engine.py
async def get_lora_config(self) -> LoRAConfig:
    """Get the lora configuration of the vLLM engine."""
    return self.engine.get_lora_config()

get_model_config async

get_model_config() -> ModelConfig

Get the model configuration of the vLLM engine.

Source code in vllm/engine/async_llm_engine.py
async def get_model_config(self) -> ModelConfig:
    """Get the model configuration of the vLLM engine."""
    return self.engine.get_model_config()

get_parallel_config async

get_parallel_config() -> ParallelConfig

Get the parallel configuration of the vLLM engine.

Source code in vllm/engine/async_llm_engine.py
async def get_parallel_config(self) -> ParallelConfig:
    """Get the parallel configuration of the vLLM engine."""
    return self.engine.get_parallel_config()

get_scheduler_config async

get_scheduler_config() -> SchedulerConfig

Get the scheduling configuration of the vLLM engine.

Source code in vllm/engine/async_llm_engine.py
async def get_scheduler_config(self) -> SchedulerConfig:
    """Get the scheduling configuration of the vLLM engine."""
    return self.engine.get_scheduler_config()

get_tokenizer async

get_tokenizer(
    lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer
Source code in vllm/engine/async_llm_engine.py
async def get_tokenizer(
    self,
    lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer:
    return await self.engine.get_tokenizer_async(lora_request)

get_vllm_config async

get_vllm_config() -> VllmConfig

Get the vllm configuration of the vLLM engine.

Source code in vllm/engine/async_llm_engine.py
async def get_vllm_config(self) -> VllmConfig:
    """Get the vllm configuration of the vLLM engine."""
    return self.engine.get_vllm_config()

is_sleeping async

is_sleeping() -> bool
Source code in vllm/engine/async_llm_engine.py
async def is_sleeping(self) -> bool:
    return self.engine.is_sleeping()

is_tracing_enabled async

is_tracing_enabled() -> bool
Source code in vllm/engine/async_llm_engine.py
async def is_tracing_enabled(self) -> bool:
    return self.engine.is_tracing_enabled()

process_request_outputs

process_request_outputs(request_outputs) -> bool
Source code in vllm/engine/async_llm_engine.py
def process_request_outputs(self, request_outputs) -> bool:
    # Put the outputs into the corresponding streams.
    all_finished = True
    for request_output in request_outputs:
        self._request_tracker.process_request_output(
            request_output, verbose=self.log_requests)
        all_finished = all_finished and request_output.finished

    return all_finished

remove_logger

remove_logger(logger_name: str) -> None
Source code in vllm/engine/async_llm_engine.py
def remove_logger(self, logger_name: str) -> None:
    self.engine.remove_logger(logger_name=logger_name)

reset_mm_cache async

reset_mm_cache() -> None
Source code in vllm/engine/async_llm_engine.py
async def reset_mm_cache(self) -> None:
    self.engine.reset_mm_cache()

reset_prefix_cache async

reset_prefix_cache(device: Optional[Device] = None) -> None
Source code in vllm/engine/async_llm_engine.py
async def reset_prefix_cache(self,
                             device: Optional[Device] = None) -> None:
    self.engine.reset_prefix_cache(device)

run_engine_loop async staticmethod

run_engine_loop(engine_ref: ReferenceType)

We use a weakref to the engine so that the running loop doesn't prevent the engine from being garbage collected.

Source code in vllm/engine/async_llm_engine.py
@staticmethod
async def run_engine_loop(engine_ref: ReferenceType):
    """We use a weakref to the engine so that the running loop
    doesn't prevent the engine being garbage collected."""
    engine: Optional[AsyncLLMEngine] = engine_ref()
    if not engine:
        return

    pipeline_parallel_size = \
            engine.engine.parallel_config.pipeline_parallel_size
    has_requests_in_progress = [False] * pipeline_parallel_size
    while True:
        if not any(has_requests_in_progress):
            logger.debug("Waiting for new requests...")
            # Stop the execute model loop in parallel workers until there
            # are more requests to process. This avoids waiting
            # indefinitely in torch.distributed ops which may otherwise
            # timeout, and unblocks the RPC thread in the workers so that
            # they can process any other queued control plane messages,
            # such as add/remove lora adapters.
            await engine.engine.stop_remote_worker_execution_loop_async()
            request_tracker = engine._request_tracker
            # Allow engine to be garbage collected while
            # waiting for new requests
            del engine
            await asyncio.sleep(0)
            if engine_ref() is None:
                return
            await request_tracker.wait_for_new_requests()
            engine = engine_ref()
            if not engine:
                return
            logger.debug("Got new requests!")
            requests_in_progress = [
                asyncio.create_task(engine.engine_step(ve))
                for ve in range(pipeline_parallel_size)
            ]
            has_requests_in_progress = [True] * pipeline_parallel_size

        # Abort if iteration takes too long due to unrecoverable errors
        # (eg. NCCL timeouts).
        try:
            async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
                done, _ = await asyncio.wait(
                    requests_in_progress,
                    return_when=asyncio.FIRST_COMPLETED)
                for _ in range(pipeline_parallel_size):
                    await asyncio.sleep(0)
            for task in done:
                result = task.result()
                virtual_engine = requests_in_progress.index(task)
                has_unfinished_requests = (
                    engine.engine.
                    has_unfinished_requests_for_virtual_engine(
                        virtual_engine))
                if result or has_unfinished_requests:
                    requests_in_progress[virtual_engine] = (
                        asyncio.create_task(
                            engine.engine_step(virtual_engine)))
                    has_requests_in_progress[virtual_engine] = True
                else:
                    has_requests_in_progress[virtual_engine] = False
        except asyncio.TimeoutError as exc:
            logger.error(
                "Engine iteration timed out. This should never happen!")
            engine.set_errored(exc)
            raise
        await asyncio.sleep(0)
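
To make the weakref point concrete, here is a generic sketch of the same pattern (illustrative only, not vLLM code): the background task holds only a weak reference to its owner, so dropping the last strong reference lets the owner be garbage collected and the loop exit.

```python
import asyncio
import weakref


class Owner:
    def start(self) -> None:
        # The task captures only a weak reference, so it does not keep
        # the Owner object alive by itself.
        self._task = asyncio.get_event_loop().create_task(
            Owner._loop(weakref.ref(self)))

    @staticmethod
    async def _loop(owner_ref) -> None:
        while True:
            owner = owner_ref()
            if owner is None:
                return  # Owner was garbage collected; stop the loop.
            # ... perform one unit of work with `owner` here ...
            del owner  # Drop the strong reference before yielding.
            await asyncio.sleep(0.1)
```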

set_errored

set_errored(exc: Exception) -> None
Source code in vllm/engine/async_llm_engine.py
def set_errored(self, exc: Exception) -> None:
    self._errored_with = exc

shutdown_background_loop

shutdown_background_loop() -> None

Shut down the background loop.

This method needs to be called during cleanup to remove references to self and properly GC the resources held by the async LLM engine (e.g., the executors as well as their resources).

Source code in vllm/engine/async_llm_engine.py
def shutdown_background_loop(self) -> None:
    """
    Shut down the background loop.

    This method needs to be called during cleanup to remove
    references to `self` and properly GC the resources held
    by the async LLM engine (e.g., the executors as well as
    their resources).
    """
    if self._background_loop_unshielded is not None:
        self._background_loop_unshielded.cancel()
        self._background_loop_unshielded = None
    self.background_loop = None

sleep async

sleep(level: int = 1) -> None
Source code in vllm/engine/async_llm_engine.py
async def sleep(self, level: int = 1) -> None:
    await self.reset_prefix_cache()
    self.engine.sleep(level)

start_background_loop

start_background_loop() -> None

Start the background loop.

Source code in vllm/engine/async_llm_engine.py
def start_background_loop(self) -> None:
    """Start the background loop."""
    if self.errored:
        raise AsyncEngineDeadError(
            "Background loop has errored already.") from self._errored_with
    if self.is_running:
        raise RuntimeError("Background loop is already running.")
    # Initialize the RequestTracker here so it uses the right event loop.
    self._request_tracker = RequestTracker()

    self._background_loop_unshielded = asyncio.get_event_loop(
    ).create_task(self.run_engine_loop(weakref.ref(self)))
    self._background_loop_unshielded.add_done_callback(
        partial(_log_task_completion, error_callback=self._error_callback))
    self.background_loop = asyncio.shield(self._background_loop_unshielded)

start_profile async

start_profile() -> None
Source code in vllm/engine/async_llm_engine.py
async def start_profile(self) -> None:
    self.engine.start_profile()

stop_profile async

stop_profile() -> None
Source code in vllm/engine/async_llm_engine.py
async def stop_profile(self) -> None:
    self.engine.stop_profile()

wake_up async

wake_up(tags: Optional[list[str]] = None) -> None
Source code in vllm/engine/async_llm_engine.py
async def wake_up(self, tags: Optional[list[str]] = None) -> None:
    self.engine.wake_up(tags)
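
A hedged sketch of pairing these calls around an idle period; it assumes the engine was created with sleep mode enabled (e.g. `enable_sleep_mode=True` in the engine arguments) and that `engine` is an AsyncLLMEngine instance.

```python
async def pause_and_resume(engine) -> None:
    # Put the engine to sleep; as shown in the source above, sleep() also
    # resets the prefix cache before delegating to the underlying engine.
    await engine.sleep(level=1)
    assert await engine.is_sleeping()

    # ... later, before serving requests again ...
    await engine.wake_up()
```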

ClassificationOutput dataclass

The output data of one classification output of a request.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| probs | list[float] | The probability vector, which is a list of floats. | required |
Source code in vllm/outputs.py
@dataclass
class ClassificationOutput:
    """The output data of one classification output of a request.

    Args:
        probs: The probability vector, which is a list of floats.
        Its length depends on the number of classes.
    """
    probs: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        # pooling_output shape: (num_classes)
        pooled_data = pooling_output.data
        if pooled_data.ndim != 1:
            raise ValueError("pooled_data should be a 1-D probability vector")

        return ClassificationOutput(pooled_data.tolist())

    @property
    def num_classes(self) -> int:
        return len(self.probs)

    def __repr__(self) -> str:
        return f"ClassificationOutput(num_classes={self.num_classes})"

num_classes property

num_classes: int

probs instance-attribute

probs: list[float]

__init__

__init__(probs: list[float]) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return f"ClassificationOutput(num_classes={self.num_classes})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    # pooling_output shape: (num_classes)
    pooled_data = pooling_output.data
    if pooled_data.ndim != 1:
        raise ValueError("pooled_data should be a 1-D probability vector")

    return ClassificationOutput(pooled_data.tolist())
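
For illustration, a minimal sketch of this conversion; it assumes `PoolingOutput` wraps a 1-D tensor in its `data` field, as the `ndim`/`tolist` usage above implies.

```python
import torch

from vllm.outputs import ClassificationOutput, PoolingOutput

# Hypothetical 3-class probability vector (assumes PoolingOutput stores
# a torch tensor in its `data` field).
pooled = PoolingOutput(data=torch.tensor([0.1, 0.7, 0.2]))

cls_out = ClassificationOutput.from_base(pooled)
print(cls_out.num_classes)  # 3
print(cls_out.probs)        # approximately [0.1, 0.7, 0.2]
```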

ClassificationRequestOutput

Bases: PoolingRequestOutput[ClassificationOutput]

Source code in vllm/outputs.py
class ClassificationRequestOutput(PoolingRequestOutput[ClassificationOutput]):

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        return ClassificationRequestOutput(
            request_id=request_output.request_id,
            outputs=ClassificationOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    return ClassificationRequestOutput(
        request_id=request_output.request_id,
        outputs=ClassificationOutput.from_base(request_output.outputs),
        prompt_token_ids=request_output.prompt_token_ids,
        finished=request_output.finished,
    )

CompletionOutput dataclass

The output data of one completion output of a request.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| index | int | The index of the output in the request. | required |
| text | str | The generated output text. | required |
| token_ids | Sequence[int] | The token IDs of the generated output text. | required |
| cumulative_logprob | Optional[float] | The cumulative log probability of the generated output text. | required |
| logprobs | Optional[SampleLogprobs] | The log probabilities of the top probability words at each position if the logprobs are requested. | required |
| finish_reason | Optional[str] | The reason why the sequence is finished. | None |
| stop_reason | Union[int, str, None] | The stop string or token id that caused the completion to stop, None if the completion finished for some other reason including encountering the EOS token. | None |
| lora_request | Optional[LoRARequest] | The LoRA request that was used to generate the output. | None |
Source code in vllm/outputs.py
@dataclass
class CompletionOutput:
    """The output data of one completion output of a request.

    Args:
        index: The index of the output in the request.
        text: The generated output text.
        token_ids: The token IDs of the generated output text.
        cumulative_logprob: The cumulative log probability of the generated
            output text.
        logprobs: The log probabilities of the top probability words at each
            position if the logprobs are requested.
        finish_reason: The reason why the sequence is finished.
        stop_reason: The stop string or token id that caused the completion
            to stop, None if the completion finished for some other reason
            including encountering the EOS token.
        lora_request: The LoRA request that was used to generate the output.
    """

    index: int
    text: str
    token_ids: GenericSequence[int]
    cumulative_logprob: Optional[float]
    logprobs: Optional[SampleLogprobs]
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None
    lora_request: Optional[LoRARequest] = None

    def finished(self) -> bool:
        return self.finish_reason is not None

    def __repr__(self) -> str:
        return (f"CompletionOutput(index={self.index}, "
                f"text={self.text!r}, "
                f"token_ids={self.token_ids}, "
                f"cumulative_logprob={self.cumulative_logprob}, "
                f"logprobs={self.logprobs}, "
                f"finish_reason={self.finish_reason}, "
                f"stop_reason={self.stop_reason})")

cumulative_logprob instance-attribute

cumulative_logprob: Optional[float]

finish_reason class-attribute instance-attribute

finish_reason: Optional[str] = None

index instance-attribute

index: int

logprobs instance-attribute

lora_request class-attribute instance-attribute

lora_request: Optional[LoRARequest] = None

stop_reason class-attribute instance-attribute

stop_reason: Union[int, str, None] = None

text instance-attribute

text: str

token_ids instance-attribute

token_ids: Sequence[int]

__init__

__init__(
    index: int,
    text: str,
    token_ids: Sequence[int],
    cumulative_logprob: Optional[float],
    logprobs: Optional[SampleLogprobs],
    finish_reason: Optional[str] = None,
    stop_reason: Union[int, str, None] = None,
    lora_request: Optional[LoRARequest] = None,
) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (f"CompletionOutput(index={self.index}, "
            f"text={self.text!r}, "
            f"token_ids={self.token_ids}, "
            f"cumulative_logprob={self.cumulative_logprob}, "
            f"logprobs={self.logprobs}, "
            f"finish_reason={self.finish_reason}, "
            f"stop_reason={self.stop_reason})")

finished

finished() -> bool
Source code in vllm/outputs.py
def finished(self) -> bool:
    return self.finish_reason is not None
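
For orientation, a small sketch of how a caller typically reads these fields off the completions of a `RequestOutput` (the generation setup itself is elided):

```python
# Sketch only: `request_output` is assumed to be a RequestOutput yielded
# by LLM.generate() or AsyncLLMEngine.generate().
def summarize(request_output) -> None:
    for completion in request_output.outputs:  # list of CompletionOutput
        status = "finished" if completion.finished() else "in progress"
        print(f"[{completion.index}] {status}: {completion.text!r} "
              f"(finish_reason={completion.finish_reason}, "
              f"stop_reason={completion.stop_reason})")
```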

EmbeddingOutput dataclass

The output data of one embedding output of a request.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| embedding | list[float] | The embedding vector, which is a list of floats. | required |
Source code in vllm/outputs.py
@dataclass
class EmbeddingOutput:
    """The output data of one embedding output of a request.

    Args:
        embedding: The embedding vector, which is a list of floats.
        Its length depends on the hidden dimension of the model.
    """
    embedding: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        pooled_data = pooling_output.data
        if pooled_data.ndim != 1:
            raise ValueError("pooled_data should be a 1-D embedding vector")

        return EmbeddingOutput(pooled_data.tolist())

    @property
    def hidden_size(self) -> int:
        return len(self.embedding)

    def __repr__(self) -> str:
        return f"EmbeddingOutput(hidden_size={self.hidden_size})"

embedding instance-attribute

embedding: list[float]

hidden_size property

hidden_size: int

__init__

__init__(embedding: list[float]) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return f"EmbeddingOutput(hidden_size={self.hidden_size})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    pooled_data = pooling_output.data
    if pooled_data.ndim != 1:
        raise ValueError("pooled_data should be a 1-D embedding vector")

    return EmbeddingOutput(pooled_data.tolist())

EmbeddingRequestOutput

Bases: PoolingRequestOutput[EmbeddingOutput]

Source code in vllm/outputs.py
class EmbeddingRequestOutput(PoolingRequestOutput[EmbeddingOutput]):

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        return EmbeddingRequestOutput(
            request_id=request_output.request_id,
            outputs=EmbeddingOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    return EmbeddingRequestOutput(
        request_id=request_output.request_id,
        outputs=EmbeddingOutput.from_base(request_output.outputs),
        prompt_token_ids=request_output.prompt_token_ids,
        finished=request_output.finished,
    )
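
A minimal sketch of the conversion this helper performs; it assumes `PoolingRequestOutput` accepts the same keyword arguments used in `from_base` above and that its `outputs` field wraps a 1-D embedding tensor.

```python
import torch

from vllm.outputs import (EmbeddingRequestOutput, PoolingOutput,
                          PoolingRequestOutput)

# Hypothetical pooled result with a 4-dimensional embedding.
pooled = PoolingRequestOutput(
    request_id="req-0",
    outputs=PoolingOutput(data=torch.randn(4)),
    prompt_token_ids=[1, 2, 3],
    finished=True,
)

emb = EmbeddingRequestOutput.from_base(pooled)
print(emb.outputs.hidden_size)  # 4
```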

EngineArgs dataclass

Arguments for vLLM engine.

Source code in vllm/engine/arg_utils.py
@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model: str = ModelConfig.model
    served_model_name: Optional[Union[
        str, List[str]]] = ModelConfig.served_model_name
    tokenizer: Optional[str] = ModelConfig.tokenizer
    hf_config_path: Optional[str] = ModelConfig.hf_config_path
    runner: RunnerOption = ModelConfig.runner
    convert: ConvertOption = ModelConfig.convert
    task: Optional[TaskOption] = ModelConfig.task
    skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
    enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
    tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
    trust_remote_code: bool = ModelConfig.trust_remote_code
    allowed_local_media_path: str = ModelConfig.allowed_local_media_path
    download_dir: Optional[str] = LoadConfig.download_dir
    load_format: Union[str, LoadFormats] = LoadConfig.load_format
    config_format: str = ModelConfig.config_format
    dtype: ModelDType = ModelConfig.dtype
    kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
    seed: Optional[int] = ModelConfig.seed
    max_model_len: Optional[int] = ModelConfig.max_model_len
    cuda_graph_sizes: list[int] = get_field(SchedulerConfig,
                                            "cuda_graph_sizes")
    # Note: Specifying a custom executor backend by passing a class
    # is intended for expert use only. The API may change without
    # notice.
    distributed_executor_backend: Optional[Union[
        str, DistributedExecutorBackend,
        Type[ExecutorBase]]] = ParallelConfig.distributed_executor_backend
    # number of P/D disaggregation (or other disaggregation) workers
    pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
    tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
    data_parallel_size: int = ParallelConfig.data_parallel_size
    data_parallel_rank: Optional[int] = None
    data_parallel_start_rank: Optional[int] = None
    data_parallel_size_local: Optional[int] = None
    data_parallel_address: Optional[str] = None
    data_parallel_rpc_port: Optional[int] = None
    data_parallel_hybrid_lb: bool = False
    data_parallel_backend: str = ParallelConfig.data_parallel_backend
    enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
    eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
    enable_eplb: bool = ParallelConfig.enable_eplb
    num_redundant_experts: int = EPLBConfig.num_redundant_experts
    eplb_window_size: int = EPLBConfig.window_size
    eplb_step_interval: int = EPLBConfig.step_interval
    eplb_log_balancedness: bool = EPLBConfig.log_balancedness
    max_parallel_loading_workers: Optional[
        int] = ParallelConfig.max_parallel_loading_workers
    block_size: Optional[BlockSize] = CacheConfig.block_size
    enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
    prefix_caching_hash_algo: PrefixCachingHashAlgo = \
        CacheConfig.prefix_caching_hash_algo
    disable_sliding_window: bool = ModelConfig.disable_sliding_window
    disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
    swap_space: float = CacheConfig.swap_space
    cpu_offload_gb: float = CacheConfig.cpu_offload_gb
    gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
    max_num_batched_tokens: Optional[
        int] = SchedulerConfig.max_num_batched_tokens
    max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
    max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
    long_prefill_token_threshold: int = \
        SchedulerConfig.long_prefill_token_threshold
    max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
    max_logprobs: int = ModelConfig.max_logprobs
    logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
    disable_log_stats: bool = False
    revision: Optional[str] = ModelConfig.revision
    code_revision: Optional[str] = ModelConfig.code_revision
    rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
    rope_theta: Optional[float] = ModelConfig.rope_theta
    hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
    hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
    tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
    quantization: Optional[QuantizationMethods] = ModelConfig.quantization
    enforce_eager: bool = ModelConfig.enforce_eager
    max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
    disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
    limit_mm_per_prompt: dict[str, int] = \
        get_field(MultiModalConfig, "limit_per_prompt")
    interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
    media_io_kwargs: dict[str, dict[str,
                                    Any]] = get_field(MultiModalConfig,
                                                      "media_io_kwargs")
    mm_processor_kwargs: Optional[Dict[str, Any]] = \
        MultiModalConfig.mm_processor_kwargs
    disable_mm_preprocessor_cache: bool = False  # DEPRECATED
    mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
    mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
    skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
    # LoRA fields
    enable_lora: bool = False
    enable_lora_bias: bool = LoRAConfig.bias_enabled
    max_loras: int = LoRAConfig.max_loras
    max_lora_rank: int = LoRAConfig.max_lora_rank
    default_mm_loras: Optional[Dict[str, str]] = \
        LoRAConfig.default_mm_loras
    fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
    max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
    lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
    lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size

    ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
    num_gpu_blocks_override: Optional[
        int] = CacheConfig.num_gpu_blocks_override
    num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
    model_loader_extra_config: dict = \
        get_field(LoadConfig, "model_loader_extra_config")
    ignore_patterns: Optional[Union[str,
                                    List[str]]] = LoadConfig.ignore_patterns
    preemption_mode: Optional[str] = SchedulerConfig.preemption_mode

    scheduler_delay_factor: float = SchedulerConfig.delay_factor
    enable_chunked_prefill: Optional[
        bool] = SchedulerConfig.enable_chunked_prefill
    disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input

    disable_hybrid_kv_cache_manager: bool = (
        SchedulerConfig.disable_hybrid_kv_cache_manager)

    guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
    guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
    guided_decoding_disable_any_whitespace: bool = \
        DecodingConfig.disable_any_whitespace
    guided_decoding_disable_additional_properties: bool = \
        DecodingConfig.disable_additional_properties
    logits_processor_pattern: Optional[
        str] = ModelConfig.logits_processor_pattern

    speculative_config: Optional[Dict[str, Any]] = None

    show_hidden_metrics_for_version: Optional[str] = \
        ObservabilityConfig.show_hidden_metrics_for_version
    otlp_traces_endpoint: Optional[str] = \
        ObservabilityConfig.otlp_traces_endpoint
    collect_detailed_traces: Optional[list[DetailedTraceModules]] = \
        ObservabilityConfig.collect_detailed_traces
    disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
    scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
    scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls

    override_neuron_config: dict[str, Any] = \
        get_field(ModelConfig, "override_neuron_config")
    override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
        ModelConfig.override_pooler_config
    compilation_config: CompilationConfig = \
        get_field(VllmConfig, "compilation_config")
    worker_cls: str = ParallelConfig.worker_cls
    worker_extension_cls: str = ParallelConfig.worker_extension_cls

    kv_transfer_config: Optional[KVTransferConfig] = None
    kv_events_config: Optional[KVEventsConfig] = None

    generation_config: str = ModelConfig.generation_config
    enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
    override_generation_config: dict[str, Any] = \
        get_field(ModelConfig, "override_generation_config")
    model_impl: str = ModelConfig.model_impl
    override_attention_dtype: str = ModelConfig.override_attention_dtype

    calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
    mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype
    mamba_ssm_cache_dtype: MambaDType = CacheConfig.mamba_ssm_cache_dtype

    additional_config: dict[str, Any] = \
        get_field(VllmConfig, "additional_config")
    reasoning_parser: str = DecodingConfig.reasoning_backend

    use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
    pt_load_map_location: str = LoadConfig.pt_load_map_location

    # DEPRECATED
    enable_multimodal_encoder_data_parallel: bool = False

    logits_processors: Optional[list[Union[
        str, type[LogitsProcessor]]]] = ModelConfig.logits_processors
    """Custom logitproc types"""

    async_scheduling: bool = SchedulerConfig.async_scheduling

    kv_sharing_fast_prefill: bool = \
        CacheConfig.kv_sharing_fast_prefill

    def __post_init__(self):
        # support `EngineArgs(compilation_config={...})`
        # without having to manually construct a
        # CompilationConfig object
        if isinstance(self.compilation_config, dict):
            self.compilation_config = CompilationConfig(
                **self.compilation_config)
        if isinstance(self.eplb_config, dict):
            self.eplb_config = EPLBConfig(**self.eplb_config)
        # Setup plugins
        from vllm.plugins import load_general_plugins
        load_general_plugins()
        # When using HF offline, replace the model id with the local model path
        if huggingface_hub.constants.HF_HUB_OFFLINE:
            model_id = self.model
            self.model = get_model_path(self.model, self.revision)
            logger.info(
                "HF_HUB_OFFLINE is True, replace model_id [%s] " \
                "to model_path [%s]",model_id, self.model)

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Shared CLI arguments for vLLM engine."""

        # Model arguments
        model_kwargs = get_kwargs(ModelConfig)
        model_group = parser.add_argument_group(
            title="ModelConfig",
            description=ModelConfig.__doc__,
        )
        if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
            model_group.add_argument("--model", **model_kwargs["model"])
        model_group.add_argument("--runner", **model_kwargs["runner"])
        model_group.add_argument("--convert", **model_kwargs["convert"])
        model_group.add_argument("--task",
                                 **model_kwargs["task"],
                                 deprecated=True)
        model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
        model_group.add_argument("--tokenizer-mode",
                                 **model_kwargs["tokenizer_mode"])
        model_group.add_argument("--trust-remote-code",
                                 **model_kwargs["trust_remote_code"])
        model_group.add_argument("--dtype", **model_kwargs["dtype"])
        model_group.add_argument("--seed", **model_kwargs["seed"])
        model_group.add_argument("--hf-config-path",
                                 **model_kwargs["hf_config_path"])
        model_group.add_argument("--allowed-local-media-path",
                                 **model_kwargs["allowed_local_media_path"])
        model_group.add_argument("--revision", **model_kwargs["revision"])
        model_group.add_argument("--code-revision",
                                 **model_kwargs["code_revision"])
        model_group.add_argument("--rope-scaling",
                                 **model_kwargs["rope_scaling"])
        model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
        model_group.add_argument("--tokenizer-revision",
                                 **model_kwargs["tokenizer_revision"])
        model_group.add_argument("--max-model-len",
                                 **model_kwargs["max_model_len"])
        model_group.add_argument("--quantization", "-q",
                                 **model_kwargs["quantization"])
        model_group.add_argument("--enforce-eager",
                                 **model_kwargs["enforce_eager"])
        model_group.add_argument("--max-seq-len-to-capture",
                                 **model_kwargs["max_seq_len_to_capture"])
        model_group.add_argument("--max-logprobs",
                                 **model_kwargs["max_logprobs"])
        model_group.add_argument("--logprobs-mode",
                                 choices=[f.value for f in LogprobsMode],
                                 **model_kwargs["logprobs_mode"])
        model_group.add_argument("--disable-sliding-window",
                                 **model_kwargs["disable_sliding_window"])
        model_group.add_argument("--disable-cascade-attn",
                                 **model_kwargs["disable_cascade_attn"])
        model_group.add_argument("--skip-tokenizer-init",
                                 **model_kwargs["skip_tokenizer_init"])
        model_group.add_argument("--enable-prompt-embeds",
                                 **model_kwargs["enable_prompt_embeds"])
        model_group.add_argument("--served-model-name",
                                 **model_kwargs["served_model_name"])
        # This one is a special case because it is the
        # opposite of ModelConfig.use_async_output_proc
        model_group.add_argument(
            "--disable-async-output-proc",
            action="store_true",
            default=EngineArgs.disable_async_output_proc,
            help="Disable async output processing. This may result in "
            "lower performance.")
        model_group.add_argument("--config-format",
                                 choices=[f.value for f in ConfigFormat],
                                 **model_kwargs["config_format"])
        # This one is a special case because it can bool
        # or str. TODO: Handle this in get_kwargs
        model_group.add_argument("--hf-token",
                                 type=str,
                                 nargs="?",
                                 const=True,
                                 default=model_kwargs["hf_token"]["default"],
                                 help=model_kwargs["hf_token"]["help"])
        model_group.add_argument("--hf-overrides",
                                 **model_kwargs["hf_overrides"])
        model_group.add_argument("--override-neuron-config",
                                 **model_kwargs["override_neuron_config"])
        model_group.add_argument("--override-pooler-config",
                                 **model_kwargs["override_pooler_config"])
        model_group.add_argument("--logits-processor-pattern",
                                 **model_kwargs["logits_processor_pattern"])
        model_group.add_argument("--generation-config",
                                 **model_kwargs["generation_config"])
        model_group.add_argument("--override-generation-config",
                                 **model_kwargs["override_generation_config"])
        model_group.add_argument("--enable-sleep-mode",
                                 **model_kwargs["enable_sleep_mode"])
        model_group.add_argument("--model-impl",
                                 choices=[f.value for f in ModelImpl],
                                 **model_kwargs["model_impl"])
        model_group.add_argument("--override-attention-dtype",
                                 **model_kwargs["override_attention_dtype"])
        model_group.add_argument("--logits-processors",
                                 **model_kwargs["logits_processors"])

        # Model loading arguments
        load_kwargs = get_kwargs(LoadConfig)
        load_group = parser.add_argument_group(
            title="LoadConfig",
            description=LoadConfig.__doc__,
        )
        load_group.add_argument("--load-format", **load_kwargs["load_format"])
        load_group.add_argument("--download-dir",
                                **load_kwargs["download_dir"])
        load_group.add_argument("--model-loader-extra-config",
                                **load_kwargs["model_loader_extra_config"])
        load_group.add_argument("--ignore-patterns",
                                **load_kwargs["ignore_patterns"])
        load_group.add_argument("--use-tqdm-on-load",
                                **load_kwargs["use_tqdm_on_load"])
        load_group.add_argument('--pt-load-map-location',
                                **load_kwargs["pt_load_map_location"])

        # Guided decoding arguments
        guided_decoding_kwargs = get_kwargs(DecodingConfig)
        guided_decoding_group = parser.add_argument_group(
            title="DecodingConfig",
            description=DecodingConfig.__doc__,
        )
        guided_decoding_group.add_argument("--guided-decoding-backend",
                                           **guided_decoding_kwargs["backend"])
        guided_decoding_group.add_argument(
            "--guided-decoding-disable-fallback",
            **guided_decoding_kwargs["disable_fallback"])
        guided_decoding_group.add_argument(
            "--guided-decoding-disable-any-whitespace",
            **guided_decoding_kwargs["disable_any_whitespace"])
        guided_decoding_group.add_argument(
            "--guided-decoding-disable-additional-properties",
            **guided_decoding_kwargs["disable_additional_properties"])
        guided_decoding_group.add_argument(
            "--reasoning-parser",
            # This choice is a special case because it's not static
            choices=list(ReasoningParserManager.reasoning_parsers),
            **guided_decoding_kwargs["reasoning_backend"])

        # Parallel arguments
        parallel_kwargs = get_kwargs(ParallelConfig)
        parallel_group = parser.add_argument_group(
            title="ParallelConfig",
            description=ParallelConfig.__doc__,
        )
        parallel_group.add_argument(
            "--distributed-executor-backend",
            **parallel_kwargs["distributed_executor_backend"])
        parallel_group.add_argument(
            "--pipeline-parallel-size", "-pp",
            **parallel_kwargs["pipeline_parallel_size"])
        parallel_group.add_argument("--tensor-parallel-size", "-tp",
                                    **parallel_kwargs["tensor_parallel_size"])
        parallel_group.add_argument("--data-parallel-size", "-dp",
                                    **parallel_kwargs["data_parallel_size"])
        parallel_group.add_argument(
            '--data-parallel-rank',
            '-dpn',
            type=int,
            help='Data parallel rank of this instance. '
            'When set, enables external load balancer mode.')
        parallel_group.add_argument('--data-parallel-start-rank',
                                    '-dpr',
                                    type=int,
                                    help='Starting data parallel rank '
                                    'for secondary nodes.')
        parallel_group.add_argument('--data-parallel-size-local',
                                    '-dpl',
                                    type=int,
                                    help='Number of data parallel replicas '
                                    'to run on this node.')
        parallel_group.add_argument('--data-parallel-address',
                                    '-dpa',
                                    type=str,
                                    help='Address of data parallel cluster '
                                    'head-node.')
        parallel_group.add_argument('--data-parallel-rpc-port',
                                    '-dpp',
                                    type=int,
                                    help='Port for data parallel RPC '
                                    'communication.')
        parallel_group.add_argument('--data-parallel-backend',
                                    '-dpb',
                                    type=str,
                                    default='mp',
                                    help='Backend for data parallel, either '
                                    '"mp" or "ray".')
        parallel_group.add_argument(
            "--data-parallel-hybrid-lb",
            **parallel_kwargs["data_parallel_hybrid_lb"])
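        # Illustrative multi-node launch using the flags above (example
        # values only):
        #   head node:  --data-parallel-size 4 --data-parallel-size-local 2
        #               --data-parallel-address 10.0.0.1
        #   other node: --data-parallel-size 4 --data-parallel-size-local 2
        #               --data-parallel-start-rank 2
        #               --data-parallel-address 10.0.0.1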
        parallel_group.add_argument(
            "--enable-expert-parallel",
            **parallel_kwargs["enable_expert_parallel"])
        parallel_group.add_argument("--enable-eplb",
                                    **parallel_kwargs["enable_eplb"])
        parallel_group.add_argument("--eplb-config",
                                    **parallel_kwargs["eplb_config"])
        parallel_group.add_argument(
            "--num-redundant-experts",
            type=int,
            help=
            "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
            deprecated=True)
        parallel_group.add_argument(
            "--eplb-window-size",
            type=int,
            help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
            deprecated=True)
        parallel_group.add_argument(
            "--eplb-step-interval",
            type=int,
            help=
            "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
            deprecated=True)
        parallel_group.add_argument(
            "--eplb-log-balancedness",
            action=argparse.BooleanOptionalAction,
            help=
            "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
            deprecated=True)

        parallel_group.add_argument(
            "--max-parallel-loading-workers",
            **parallel_kwargs["max_parallel_loading_workers"])
        parallel_group.add_argument(
            "--ray-workers-use-nsight",
            **parallel_kwargs["ray_workers_use_nsight"])
        parallel_group.add_argument(
            "--disable-custom-all-reduce",
            **parallel_kwargs["disable_custom_all_reduce"])
        parallel_group.add_argument("--worker-cls",
                                    **parallel_kwargs["worker_cls"])
        parallel_group.add_argument("--worker-extension-cls",
                                    **parallel_kwargs["worker_extension_cls"])
        parallel_group.add_argument(
            "--enable-multimodal-encoder-data-parallel",
            action="store_true",
            deprecated=True)

        # KV cache arguments
        cache_kwargs = get_kwargs(CacheConfig)
        cache_group = parser.add_argument_group(
            title="CacheConfig",
            description=CacheConfig.__doc__,
        )
        cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
        cache_group.add_argument("--gpu-memory-utilization",
                                 **cache_kwargs["gpu_memory_utilization"])
        cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
        cache_group.add_argument("--kv-cache-dtype",
                                 **cache_kwargs["cache_dtype"])
        cache_group.add_argument("--num-gpu-blocks-override",
                                 **cache_kwargs["num_gpu_blocks_override"])
        cache_group.add_argument("--enable-prefix-caching",
                                 **cache_kwargs["enable_prefix_caching"])
        cache_group.add_argument("--prefix-caching-hash-algo",
                                 **cache_kwargs["prefix_caching_hash_algo"])
        cache_group.add_argument("--cpu-offload-gb",
                                 **cache_kwargs["cpu_offload_gb"])
        cache_group.add_argument("--calculate-kv-scales",
                                 **cache_kwargs["calculate_kv_scales"])
        cache_group.add_argument("--kv-sharing-fast-prefill",
                                 **cache_kwargs["kv_sharing_fast_prefill"])
        cache_group.add_argument("--mamba-cache-dtype",
                                 **cache_kwargs["mamba_cache_dtype"])
        cache_group.add_argument("--mamba-ssm-cache-dtype",
                                 **cache_kwargs["mamba_ssm_cache_dtype"])

        # Multimodal related configs
        multimodal_kwargs = get_kwargs(MultiModalConfig)
        multimodal_group = parser.add_argument_group(
            title="MultiModalConfig",
            description=MultiModalConfig.__doc__,
        )
        multimodal_group.add_argument("--limit-mm-per-prompt",
                                      **multimodal_kwargs["limit_per_prompt"])
        multimodal_group.add_argument("--media-io-kwargs",
                                      **multimodal_kwargs["media_io_kwargs"])
        multimodal_group.add_argument(
            "--mm-processor-kwargs",
            **multimodal_kwargs["mm_processor_kwargs"])
        multimodal_group.add_argument(
            "--mm-processor-cache-gb",
            **multimodal_kwargs["mm_processor_cache_gb"])
        multimodal_group.add_argument("--disable-mm-preprocessor-cache",
                                      action="store_true",
                                      deprecated=True)
        multimodal_group.add_argument(
            "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"])
        multimodal_group.add_argument(
            "--interleave-mm-strings",
            **multimodal_kwargs["interleave_mm_strings"])
        multimodal_group.add_argument("--skip-mm-profiling",
                                      **multimodal_kwargs["skip_mm_profiling"])

        # LoRA related configs
        lora_kwargs = get_kwargs(LoRAConfig)
        lora_group = parser.add_argument_group(
            title="LoRAConfig",
            description=LoRAConfig.__doc__,
        )
        lora_group.add_argument(
            "--enable-lora",
            action=argparse.BooleanOptionalAction,
            help="If True, enable handling of LoRA adapters.")
        lora_group.add_argument("--enable-lora-bias",
                                **lora_kwargs["bias_enabled"])
        lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
        lora_group.add_argument("--max-lora-rank",
                                **lora_kwargs["max_lora_rank"])
        lora_group.add_argument("--lora-extra-vocab-size",
                                **lora_kwargs["lora_extra_vocab_size"])
        lora_group.add_argument(
            "--lora-dtype",
            **lora_kwargs["lora_dtype"],
        )
        lora_group.add_argument("--max-cpu-loras",
                                **lora_kwargs["max_cpu_loras"])
        lora_group.add_argument("--fully-sharded-loras",
                                **lora_kwargs["fully_sharded_loras"])
        lora_group.add_argument("--default-mm-loras",
                                **lora_kwargs["default_mm_loras"])

        # Observability arguments
        observability_kwargs = get_kwargs(ObservabilityConfig)
        observability_group = parser.add_argument_group(
            title="ObservabilityConfig",
            description=ObservabilityConfig.__doc__,
        )
        observability_group.add_argument(
            "--show-hidden-metrics-for-version",
            **observability_kwargs["show_hidden_metrics_for_version"])
        observability_group.add_argument(
            "--otlp-traces-endpoint",
            **observability_kwargs["otlp_traces_endpoint"])
        # TODO: generalise this special case
        choices = observability_kwargs["collect_detailed_traces"]["choices"]
        metavar = f"{{{','.join(choices)}}}"
        observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
        observability_kwargs["collect_detailed_traces"]["choices"] += [
            ",".join(p)
            for p in permutations(get_args(DetailedTraceModules), r=2)
        ]
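        # e.g. comma-joined pairs such as "model,worker" are accepted in
        # addition to the single-module choices (illustrative).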
        observability_group.add_argument(
            "--collect-detailed-traces",
            **observability_kwargs["collect_detailed_traces"])

        # Scheduler arguments
        scheduler_kwargs = get_kwargs(SchedulerConfig)
        scheduler_group = parser.add_argument_group(
            title="SchedulerConfig",
            description=SchedulerConfig.__doc__,
        )
        scheduler_group.add_argument(
            "--max-num-batched-tokens",
            **scheduler_kwargs["max_num_batched_tokens"])
        scheduler_group.add_argument("--max-num-seqs",
                                     **scheduler_kwargs["max_num_seqs"])
        scheduler_group.add_argument(
            "--max-num-partial-prefills",
            **scheduler_kwargs["max_num_partial_prefills"])
        scheduler_group.add_argument(
            "--max-long-partial-prefills",
            **scheduler_kwargs["max_long_partial_prefills"])
        scheduler_group.add_argument('--cuda-graph-sizes',
                                     **scheduler_kwargs["cuda_graph_sizes"])
        scheduler_group.add_argument(
            "--long-prefill-token-threshold",
            **scheduler_kwargs["long_prefill_token_threshold"])
        scheduler_group.add_argument("--num-lookahead-slots",
                                     **scheduler_kwargs["num_lookahead_slots"])
        scheduler_group.add_argument("--scheduler-delay-factor",
                                     **scheduler_kwargs["delay_factor"])
        scheduler_group.add_argument("--preemption-mode",
                                     **scheduler_kwargs["preemption_mode"])
        # multi-step scheduling has been removed; corresponding arguments
        # are no longer supported.
        scheduler_group.add_argument("--scheduling-policy",
                                     **scheduler_kwargs["policy"])
        scheduler_group.add_argument(
            "--enable-chunked-prefill",
            **scheduler_kwargs["enable_chunked_prefill"])
        scheduler_group.add_argument(
            "--disable-chunked-mm-input",
            **scheduler_kwargs["disable_chunked_mm_input"])
        scheduler_group.add_argument("--scheduler-cls",
                                     **scheduler_kwargs["scheduler_cls"])
        scheduler_group.add_argument(
            "--disable-hybrid-kv-cache-manager",
            **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
        scheduler_group.add_argument("--async-scheduling",
                                     **scheduler_kwargs["async_scheduling"])

        # vLLM arguments
        vllm_kwargs = get_kwargs(VllmConfig)
        vllm_group = parser.add_argument_group(
            title="VllmConfig",
            description=VllmConfig.__doc__,
        )
        # We construct SpeculativeConfig using fields from other configs in
        # create_engine_config. So we parse the value as a plain JSON string
        # here to delay the Pydantic validation that comes with
        # SpeculativeConfig.
        vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads)
        vllm_group.add_argument("--speculative-config",
                                **vllm_kwargs["speculative_config"])
        vllm_group.add_argument("--kv-transfer-config",
                                **vllm_kwargs["kv_transfer_config"])
        vllm_group.add_argument('--kv-events-config',
                                **vllm_kwargs["kv_events_config"])
        vllm_group.add_argument("--compilation-config", "-O",
                                **vllm_kwargs["compilation_config"])
        vllm_group.add_argument("--additional-config",
                                **vllm_kwargs["additional_config"])

        # Other arguments
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='Disable logging statistics.')

        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_model_config(self) -> ModelConfig:
        # gguf file needs a specific model loader and doesn't use hf_repo
        if check_gguf_file(self.model):
            self.quantization = self.load_format = "gguf"

        # NOTE: This is to allow model loading from S3 in CI
        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
                and self.model in MODELS_ON_S3 and self.load_format == "auto"):
            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
            self.load_format = "runai_streamer"

        if self.disable_mm_preprocessor_cache:
            logger.warning(
                "`--disable-mm-preprocessor-cache` is deprecated "
                "and will be removed in v0.13. "
                "Please use `--mm-processor-cache-gb 0` instead.", )

            self.mm_processor_cache_gb = 0
        elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
            logger.warning(
                "VLLM_MM_INPUT_CACHE_GIB` is deprecated "
                "and will be removed in v0.13. "
                "Please use `--mm-processor-cache-gb %d` instead.",
                envs.VLLM_MM_INPUT_CACHE_GIB,
            )

            self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB

        if self.enable_multimodal_encoder_data_parallel:
            logger.warning(
                "--enable-multimodal-encoder-data-parallel` is deprecated "
                "and will be removed in v0.13. "
                "Please use `--mm-encoder-tp-mode data` instead.")

            self.mm_encoder_tp_mode = "data"

        return ModelConfig(
            model=self.model,
            hf_config_path=self.hf_config_path,
            runner=self.runner,
            convert=self.convert,
            task=self.task,
            tokenizer=self.tokenizer,
            tokenizer_mode=self.tokenizer_mode,
            trust_remote_code=self.trust_remote_code,
            allowed_local_media_path=self.allowed_local_media_path,
            dtype=self.dtype,
            seed=self.seed,
            revision=self.revision,
            code_revision=self.code_revision,
            rope_scaling=self.rope_scaling,
            rope_theta=self.rope_theta,
            hf_token=self.hf_token,
            hf_overrides=self.hf_overrides,
            tokenizer_revision=self.tokenizer_revision,
            max_model_len=self.max_model_len,
            quantization=self.quantization,
            enforce_eager=self.enforce_eager,
            max_seq_len_to_capture=self.max_seq_len_to_capture,
            max_logprobs=self.max_logprobs,
            logprobs_mode=self.logprobs_mode,
            disable_sliding_window=self.disable_sliding_window,
            disable_cascade_attn=self.disable_cascade_attn,
            skip_tokenizer_init=self.skip_tokenizer_init,
            enable_prompt_embeds=self.enable_prompt_embeds,
            served_model_name=self.served_model_name,
            limit_mm_per_prompt=self.limit_mm_per_prompt,
            interleave_mm_strings=self.interleave_mm_strings,
            media_io_kwargs=self.media_io_kwargs,
            skip_mm_profiling=self.skip_mm_profiling,
            use_async_output_proc=not self.disable_async_output_proc,
            config_format=self.config_format,
            mm_processor_kwargs=self.mm_processor_kwargs,
            mm_processor_cache_gb=self.mm_processor_cache_gb,
            mm_encoder_tp_mode=self.mm_encoder_tp_mode,
            override_neuron_config=self.override_neuron_config,
            override_pooler_config=self.override_pooler_config,
            logits_processor_pattern=self.logits_processor_pattern,
            generation_config=self.generation_config,
            override_generation_config=self.override_generation_config,
            enable_sleep_mode=self.enable_sleep_mode,
            model_impl=self.model_impl,
            override_attention_dtype=self.override_attention_dtype,
            logits_processors=self.logits_processors,
        )

    def validate_tensorizer_args(self):
        from vllm.model_executor.model_loader.tensorizer import (
            TensorizerConfig)
        for key in self.model_loader_extra_config:
            if key in TensorizerConfig._fields:
                self.model_loader_extra_config["tensorizer_config"][
                    key] = self.model_loader_extra_config[key]

    def create_load_config(self) -> LoadConfig:

        if self.quantization == "bitsandbytes":
            self.load_format = "bitsandbytes"

        if self.load_format == "tensorizer":
            if hasattr(self.model_loader_extra_config, "to_serializable"):
                self.model_loader_extra_config = (
                    self.model_loader_extra_config.to_serializable())
            self.model_loader_extra_config["tensorizer_config"] = {}
            self.model_loader_extra_config["tensorizer_config"][
                "tensorizer_dir"] = self.model
            self.validate_tensorizer_args()

        return LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
            device="cpu"
            if is_online_quantization(self.quantization) else None,
            model_loader_extra_config=self.model_loader_extra_config,
            ignore_patterns=self.ignore_patterns,
            use_tqdm_on_load=self.use_tqdm_on_load,
            pt_load_map_location=self.pt_load_map_location,
        )

    def create_speculative_config(
        self,
        target_model_config: ModelConfig,
        target_parallel_config: ParallelConfig,
        enable_chunked_prefill: bool,
        disable_log_stats: bool,
    ) -> Optional["SpeculativeConfig"]:
        """Initializes and returns a SpeculativeConfig object based on
        `speculative_config`.

        This function utilizes `speculative_config` to create a
        SpeculativeConfig object. The `speculative_config` can either be
        provided as a JSON string input via CLI arguments or directly as a
        dictionary from the engine.
        """

        from vllm.transformers_utils.config import get_config
        from vllm.transformers_utils.configs.speculators.base import (
            SpeculatorsConfig)

        if self.speculative_config is None:
            hf_config = get_config(self.hf_config_path or self.model,
                                   self.trust_remote_code, self.revision,
                                   self.code_revision, self.config_format)

            # If loading a SpeculatorsConfig, read the speculative config
            # details directly from the HF config; no user input is
            # required or expected.
            if isinstance(hf_config, SpeculatorsConfig):
                # No speculative config was provided, so build one here.
                self.speculative_config = {}
                self.speculative_config[
                    "num_speculative_tokens"] = hf_config.num_lookahead_tokens
                self.speculative_config["model"] = self.model
                self.speculative_config["method"] = hf_config.method
            else:
                return None

        # Note(Shangming): These parameters are not obtained from the cli arg
        # '--speculative-config' and must be passed in when creating the engine
        # config.
        self.speculative_config.update({
            "target_model_config": target_model_config,
            "target_parallel_config": target_parallel_config,
            "enable_chunked_prefill": enable_chunked_prefill,
            "disable_log_stats": disable_log_stats,
        })
        return SpeculativeConfig(**self.speculative_config)

    def create_engine_config(
        self,
        usage_context: Optional[UsageContext] = None,
        headless: bool = False,
    ) -> VllmConfig:
        """
        Create the VllmConfig.

        NOTE: for autoselection of V0 vs V1 engine, we need to
        create the ModelConfig first, since ModelConfig's attrs
        (e.g. the model arch) are needed to make the decision.

        This function sets VLLM_USE_V1=X if VLLM_USE_V1 is
        unspecified by the user.

        If VLLM_USE_V1 is specified by the user but the VllmConfig
        is incompatible, we raise an error.
        """
        current_platform.pre_register_and_update()

        device_config = DeviceConfig(
            device=cast(Device, current_platform.device_type))
        model_config = self.create_model_config()

        # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
        #   and fall back to V0 for experimental or unsupported features.
        # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
        #   features and raise error for unsupported features.
        # * If VLLM_USE_V1=0, we disable V1.
        use_v1 = False
        try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
        if try_v1 and self._is_v1_supported_oracle(model_config):
            use_v1 = True

        # If user explicitly set VLLM_USE_V1, sanity check we respect it.
        if envs.is_set("VLLM_USE_V1"):
            assert use_v1 == envs.VLLM_USE_V1
        # Otherwise, set the VLLM_USE_V1 variable globally.
        else:
            envs.set_vllm_use_v1(use_v1)

        # Set default arguments for V0 or V1 Engine.
        if use_v1:
            self._set_default_args_v1(usage_context, model_config)
            # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
            if current_platform.is_cpu(
            ) and current_platform.get_cpu_architecture() in (
                    CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
                logger.info(
                    "Chunked prefill is not supported for ARM and POWER "
                    "and S390X CPUs; "
                    "disabling it for V1 backend.")
                self.enable_chunked_prefill = False
        else:
            self._set_default_args_v0(model_config)
        assert self.enable_chunked_prefill is not None

        if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
            assert self.enforce_eager, (
                "Cuda graph is not supported with DualChunkFlashAttention. "
                "To run the model in eager mode, set 'enforce_eager=True' "
                "or use '--enforce-eager' in the CLI.")
            assert current_platform.is_cuda(), (
                "DualChunkFlashAttention is only supported on CUDA platform.")
            assert not use_v1, (
                "DualChunkFlashAttention is not supported on V1 engine. "
                "To run the model in V0 engine, try set 'VLLM_USE_V1=0'")

        sliding_window: Optional[int] = None
        if not is_interleaved(model_config.hf_text_config):
            # Only set CacheConfig.sliding_window if the model is all sliding
            # window. Otherwise CacheConfig.sliding_window will override the
            # global layers in interleaved sliding window models.
            sliding_window = model_config.get_sliding_window()

        cache_config = CacheConfig(
            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
            swap_space=self.swap_space,
            cache_dtype=self.kv_cache_dtype,
            is_attention_free=model_config.is_attention_free,
            num_gpu_blocks_override=self.num_gpu_blocks_override,
            sliding_window=sliding_window,
            enable_prefix_caching=self.enable_prefix_caching,
            prefix_caching_hash_algo=self.prefix_caching_hash_algo,
            cpu_offload_gb=self.cpu_offload_gb,
            calculate_kv_scales=self.calculate_kv_scales,
            kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
            mamba_cache_dtype=self.mamba_cache_dtype,
            mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
        )

        ray_runtime_env = None
        if is_ray_initialized():
            # Ray Serve LLM calls `create_engine_config` in the context
            # of a Ray task, therefore we check is_ray_initialized()
            # as opposed to is_in_ray_actor().
            import ray
            ray_runtime_env = ray.get_runtime_context().runtime_env
            logger.info("Using ray runtime env: %s", ray_runtime_env)

        # Get the current placement group if Ray is initialized and
        # we are in a Ray actor. If so, then the placement group will be
        # passed to spawned processes.
        placement_group = None
        if is_in_ray_actor():
            import ray

            # This call initializes Ray automatically if it is not initialized,
            # but we should not do this here.
            placement_group = ray.util.get_current_placement_group()

        assert not headless or not self.data_parallel_hybrid_lb, (
            "data_parallel_hybrid_lb is not applicable in "
            "headless mode")

        data_parallel_external_lb = self.data_parallel_rank is not None
        # An explicit DP rank means this instance runs a single local DP
        # replica behind a pure external load balancer.
        if data_parallel_external_lb:
            assert self.data_parallel_size_local in (1, None), (
                "data_parallel_size_local must be 1 when data_parallel_rank "
                "is set")
            data_parallel_size_local = 1
            # Use full external lb if we have local_size of 1.
            self.data_parallel_hybrid_lb = False
        elif self.data_parallel_size_local is not None:
            data_parallel_size_local = self.data_parallel_size_local

            if self.data_parallel_start_rank and not headless:
                # Infer hybrid LB mode.
                self.data_parallel_hybrid_lb = True

            if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
                # Use full external lb if we have local_size of 1.
                data_parallel_external_lb = True
                self.data_parallel_hybrid_lb = False

            if data_parallel_size_local == self.data_parallel_size:
                # Disable hybrid LB mode if set for a single node
                self.data_parallel_hybrid_lb = False

            self.data_parallel_rank = self.data_parallel_start_rank or 0
        else:
            assert not self.data_parallel_hybrid_lb, (
                "data_parallel_size_local must be set to use "
                "data_parallel_hybrid_lb.")

            # Local DP size defaults to global DP size if not set.
            data_parallel_size_local = self.data_parallel_size

        # DP address, used in multi-node case for torch distributed group
        # and ZMQ sockets.
        if self.data_parallel_address is None:
            if self.data_parallel_backend == "ray":
                host_ip = get_ip()
                logger.info(
                    "Using host IP %s as ray-based data parallel address",
                    host_ip)
                data_parallel_address = host_ip
            else:
                assert self.data_parallel_backend == "mp", (
                    "data_parallel_backend can only be ray or mp, got "
                    f"{self.data_parallel_backend}")
                data_parallel_address = ParallelConfig.data_parallel_master_ip
        else:
            data_parallel_address = self.data_parallel_address

        # This port is only used when there are remote data parallel engines,
        # otherwise the local IPC transport is used.
        data_parallel_rpc_port = self.data_parallel_rpc_port if (
            self.data_parallel_rpc_port
            is not None) else ParallelConfig.data_parallel_rpc_port

        if self.async_scheduling:
            # Async scheduling does not work with the uniprocess backend.
            if self.distributed_executor_backend is None:
                self.distributed_executor_backend = "mp"
                logger.info("Using mp-based distributed executor backend "
                            "for async scheduling.")
            if self.distributed_executor_backend == "uni":
                raise ValueError("Async scheduling is not supported with "
                                 "uni-process backend.")
            if self.pipeline_parallel_size > 1:
                raise ValueError("Async scheduling is not supported with "
                                 "pipeline-parallel-size > 1.")

            # Currently, async scheduling does not support speculative decoding.
            # TODO(woosuk): Support it.
            if self.speculative_config is not None:
                raise ValueError(
                    "Currently, speculative decoding is not supported with "
                    "async scheduling.")

        # Forward the deprecated CLI args to the EPLB config.
        if self.num_redundant_experts is not None:
            self.eplb_config.num_redundant_experts = self.num_redundant_experts
        if self.eplb_window_size is not None:
            self.eplb_config.window_size = self.eplb_window_size
        if self.eplb_step_interval is not None:
            self.eplb_config.step_interval = self.eplb_step_interval
        if self.eplb_log_balancedness is not None:
            self.eplb_config.log_balancedness = self.eplb_log_balancedness

        parallel_config = ParallelConfig(
            pipeline_parallel_size=self.pipeline_parallel_size,
            tensor_parallel_size=self.tensor_parallel_size,
            data_parallel_size=self.data_parallel_size,
            data_parallel_rank=self.data_parallel_rank or 0,
            data_parallel_external_lb=data_parallel_external_lb,
            data_parallel_size_local=data_parallel_size_local,
            data_parallel_master_ip=data_parallel_address,
            data_parallel_rpc_port=data_parallel_rpc_port,
            data_parallel_backend=self.data_parallel_backend,
            data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
            enable_expert_parallel=self.enable_expert_parallel,
            enable_eplb=self.enable_eplb,
            eplb_config=self.eplb_config,
            max_parallel_loading_workers=self.max_parallel_loading_workers,
            disable_custom_all_reduce=self.disable_custom_all_reduce,
            ray_workers_use_nsight=self.ray_workers_use_nsight,
            ray_runtime_env=ray_runtime_env,
            placement_group=placement_group,
            distributed_executor_backend=self.distributed_executor_backend,
            worker_cls=self.worker_cls,
            worker_extension_cls=self.worker_extension_cls,
        )

        if model_config.is_multimodal_model:
            dp_supports_mm_processor_cache = (self.data_parallel_size == 1
                                              or data_parallel_external_lb)
            if (not dp_supports_mm_processor_cache
                    and model_config.mm_processor_cache_gb > 0):
                logger.warning(
                    "Multi-modal processor cache is disabled because "
                    "it is not compatible with data parallelism when "
                    "there does not exist a one-to-one correspondance "
                    "between API and engine core processes.")
                model_config.set_mm_processor_cache_gb(0)

        speculative_config = self.create_speculative_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_log_stats=self.disable_log_stats,
        )

        # make sure num_lookahead_slots is set appropriately depending on
        # whether speculative decoding is enabled
        num_lookahead_slots = self.num_lookahead_slots
        if speculative_config is not None:
            num_lookahead_slots = speculative_config.num_lookahead_slots

        scheduler_config = SchedulerConfig(
            runner_type=model_config.runner_type,
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
            cuda_graph_sizes=self.cuda_graph_sizes,
            num_lookahead_slots=num_lookahead_slots,
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_chunked_mm_input=self.disable_chunked_mm_input,
            is_multimodal_model=model_config.is_multimodal_model,
            preemption_mode=self.preemption_mode,
            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                             and parallel_config.use_ray),
            policy=self.scheduling_policy,
            scheduler_cls=self.scheduler_cls,
            max_num_partial_prefills=self.max_num_partial_prefills,
            max_long_partial_prefills=self.max_long_partial_prefills,
            long_prefill_token_threshold=self.long_prefill_token_threshold,
            disable_hybrid_kv_cache_manager=self.
            disable_hybrid_kv_cache_manager,
            async_scheduling=self.async_scheduling,
        )

        if not model_config.is_multimodal_model and self.default_mm_loras:
            raise ValueError(
                "Default modality-specific LoRA(s) were provided for a "
                "non multimodal model")

        lora_config = LoRAConfig(
            bias_enabled=self.enable_lora_bias,
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            default_mm_loras=self.default_mm_loras,
            fully_sharded_loras=self.fully_sharded_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
            and self.max_cpu_loras > 0 else None) if self.enable_lora else None

        # bitsandbytes pre-quantized model need a specific model loader
        if model_config.quantization == "bitsandbytes":
            self.quantization = self.load_format = "bitsandbytes"

        load_config = self.create_load_config()

        decoding_config = DecodingConfig(
            backend=self.guided_decoding_backend,
            disable_fallback=self.guided_decoding_disable_fallback,
            disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
            disable_additional_properties=\
                self.guided_decoding_disable_additional_properties,
            reasoning_backend=self.reasoning_parser
        )

        observability_config = ObservabilityConfig(
            show_hidden_metrics_for_version=(
                self.show_hidden_metrics_for_version),
            otlp_traces_endpoint=self.otlp_traces_endpoint,
            collect_detailed_traces=self.collect_detailed_traces,
        )

        config = VllmConfig(
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            speculative_config=speculative_config,
            load_config=load_config,
            decoding_config=decoding_config,
            observability_config=observability_config,
            compilation_config=self.compilation_config,
            kv_transfer_config=self.kv_transfer_config,
            kv_events_config=self.kv_events_config,
            additional_config=self.additional_config,
        )

        return config

    def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
        """Oracle for whether to use V0 or V1 Engine by default."""

        #############################################################
        # Unsupported Feature Flags on V1.

        if self.load_format == "sharded_state":
            _raise_or_fallback(
                feature_name=f"--load_format {self.load_format}",
                recommend_to_remove=False)
            return False

        if (self.logits_processor_pattern
                != EngineArgs.logits_processor_pattern):
            _raise_or_fallback(feature_name="--logits-processor-pattern",
                               recommend_to_remove=False)
            return False

        if self.preemption_mode != SchedulerConfig.preemption_mode:
            _raise_or_fallback(feature_name="--preemption-mode",
                               recommend_to_remove=True)
            return False

        if (self.disable_async_output_proc
                != EngineArgs.disable_async_output_proc):
            _raise_or_fallback(feature_name="--disable-async-output-proc",
                               recommend_to_remove=True)
            return False

        if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
            _raise_or_fallback(feature_name="--scheduler-delay-factor",
                               recommend_to_remove=True)
            return False

        # Triton v3.3 has f16 conversion regression issue on Turing and Volta,
        # which broke fp16 inference
        # see: https://github.com/triton-lang/triton/issues/6698
        if (current_platform.is_cuda()
                and not current_platform.has_device_capability(80)
                and model_config.dtype == torch.float16):
            _raise_or_fallback(
                feature_name="Compute Capability < 8.0 with FP16",
                recommend_to_remove=False)
            return False

        if self.kv_cache_dtype != "auto":
            supported = current_platform.is_kv_cache_dtype_supported(
                self.kv_cache_dtype, model_config)
            if not supported:
                _raise_or_fallback(feature_name="--kv-cache-dtype",
                                   recommend_to_remove=False)
                return False

        # No text embedding inputs so far.
        if self.enable_prompt_embeds:
            _raise_or_fallback(feature_name="--enable-prompt-embeds",
                               recommend_to_remove=False)
            return False

        # No Mamba or Encoder-Decoder so far.
        if not model_config.is_v1_compatible:
            _raise_or_fallback(feature_name=model_config.architectures,
                               recommend_to_remove=False)
            return False

        # V1 mamba models are unoptimized.
        if model_config.has_inner_state and _warn_or_fallback(
                feature_name="Mamba"):
            return False

        # No Concurrent Partial Prefills so far.
        if (self.max_num_partial_prefills
                != SchedulerConfig.max_num_partial_prefills
                or self.max_long_partial_prefills
                != SchedulerConfig.max_long_partial_prefills):
            _raise_or_fallback(feature_name="Concurrent Partial Prefill",
                               recommend_to_remove=False)
            return False

        # No OTLP observability so far.
        if (self.otlp_traces_endpoint or self.collect_detailed_traces):
            _raise_or_fallback(feature_name="--otlp-traces-endpoint",
                               recommend_to_remove=False)
            return False

        # V1 supports N-gram, Medusa, and Eagle speculative decoding.
        if (self.speculative_config is not None
                and self.speculative_config.get("method") == "draft_model"):
            raise NotImplementedError(
                "Speculative decoding with draft model is not supported yet. "
                "Please consider using other speculative decoding methods "
                "such as ngram, medusa, eagle, or deepseek_mtp.")

        V1_BACKENDS = [
            "FLASH_ATTN_VLLM_V1",
            "FLASH_ATTN",
            "PALLAS",
            "PALLAS_VLLM_V1",
            "TRITON_ATTN_VLLM_V1",
            "TRITON_MLA",
            "CUTLASS_MLA",
            "FLASHMLA",
            "FLASHINFER",
            "FLASHINFER_VLLM_V1",
            "ROCM_AITER_MLA",
            "TORCH_SDPA_VLLM_V1",
            "FLEX_ATTENTION",
            "TREE_ATTN",
            "XFORMERS_VLLM_V1",
        ]
        if (envs.is_set("VLLM_ATTENTION_BACKEND")
                and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
            name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
            _raise_or_fallback(feature_name=name, recommend_to_remove=True)
            return False

        # Platforms must decide if they can support v1 for this model
        if not current_platform.supports_v1(model_config=model_config):
            _raise_or_fallback(
                feature_name=f"device type={current_platform.device_type}",
                recommend_to_remove=False)
            return False
        #############################################################
        # Experimental Features - allow users to opt in.

        if self.pipeline_parallel_size > 1:
            supports_pp = getattr(self.distributed_executor_backend,
                                  'supports_pp', False)
            if not supports_pp and self.distributed_executor_backend not in (
                    ParallelConfig.distributed_executor_backend, "ray", "mp",
                    "external_launcher"):
                name = "Pipeline Parallelism without Ray distributed " \
                        "executor or multiprocessing executor or external " \
                        "launcher"
                _raise_or_fallback(feature_name=name,
                                   recommend_to_remove=False)
                return False

        # The platform may be supported on V1, but off by default for now.
        if not current_platform.default_v1(  # noqa: SIM103
                model_config=model_config) and _warn_or_fallback(
                    current_platform.device_name):
            return False

        if (current_platform.is_cpu()
                and model_config.get_sliding_window() is not None):
            _raise_or_fallback(feature_name="sliding window (CPU backend)",
                               recommend_to_remove=False)
            return False

        #############################################################

        return True

    def _set_default_args_v0(self, model_config: ModelConfig) -> None:
        """Set Default Arguments for V0 Engine."""

        max_model_len = model_config.max_model_len
        use_long_context = max_model_len > 32768
        if self.enable_chunked_prefill is None:
            # Chunked prefill not supported for Multimodal or MLA in V0.
            if model_config.is_multimodal_model or model_config.use_mla:
                self.enable_chunked_prefill = False

            # Enable chunked prefill by default for long context (> 32K)
            # models to avoid OOM errors in initial memory profiling phase.
            elif use_long_context:
                is_gpu = current_platform.is_cuda()
                use_sliding_window = (model_config.get_sliding_window()
                                      is not None)
                use_spec_decode = self.speculative_config is not None

                if (is_gpu and not use_sliding_window and not use_spec_decode
                        and not self.enable_lora
                        and model_config.runner_type != "pooling"):
                    self.enable_chunked_prefill = True
                    logger.warning(
                        "Chunked prefill is enabled by default for models "
                        "with max_model_len > 32K. Chunked prefill might "
                        "not work with some features or models. If you "
                        "encounter any issues, please disable by launching "
                        "with --enable-chunked-prefill=False.")

            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = False

        if not self.enable_chunked_prefill and use_long_context:
            logger.warning(
                "The model has a long context length (%s). This may cause"
                "OOM during the initial memory profiling phase, or result "
                "in low performance due to small KV cache size. Consider "
                "setting --max-model-len to a smaller value.", max_model_len)
        elif (self.enable_chunked_prefill
              and model_config.runner_type == "pooling"):
            msg = "Chunked prefill is not supported for pooling models"
            raise ValueError(msg)

        # if using prefix caching, we must set a hash algo
        if self.enable_prefix_caching:
            # Disable prefix caching for multimodal models for VLLM_V0.
            if model_config.is_multimodal_model:
                logger.warning(
                    "--enable-prefix-caching is not supported for multimodal "
                    "models in V0 and has been disabled.")
                self.enable_prefix_caching = False

            # VLLM_V0 only supports builtin hash algo for prefix caching.
            if self.prefix_caching_hash_algo == "sha256":
                raise ValueError(
                    "sha256 is not supported for prefix caching in V0 engine. "
                    "Please use 'builtin'.")

        # Set max_num_seqs to 256 for VLLM_V0.
        if self.max_num_seqs is None:
            self.max_num_seqs = 256

    def _set_default_args_v1(self, usage_context: UsageContext,
                             model_config: ModelConfig) -> None:
        """Set Default Arguments for V1 Engine."""

        # V1 always uses chunked prefills and prefix caching
        # for non-pooling tasks.
        # For pooling tasks, the default is False.
        if model_config.runner_type != "pooling":
            self.enable_chunked_prefill = True
            if self.enable_prefix_caching is None:
                self.enable_prefix_caching = True
        else:

            pooling_type = model_config.pooler_config.pooling_type
            is_causal = getattr(model_config.hf_config, "is_causal", True)
            incremental_prefill_supported = (pooling_type is not None
                                             and pooling_type.lower() == "last"
                                             and is_causal)

            action = "Enabling" if \
                incremental_prefill_supported else "Disabling"

            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = incremental_prefill_supported
                logger.info("(%s) chunked prefill by default", action)
            if self.enable_prefix_caching is None:
                self.enable_prefix_caching = incremental_prefill_supported
                logger.info("(%s) prefix caching by default", action)

        # V1 should use the new scheduler by default.
        # Swap it only if this arg is set to the original V0 default
        if self.scheduler_cls == EngineArgs.scheduler_cls:
            self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"

        # When no user override, set the default values based on the usage
        # context.
        # Use different default values for different hardware.

        # Try to query the device name on the current platform. If it fails,
        # it may be because the platform that imports vLLM is not the same
        # as the platform that vLLM is running on (e.g. the case of scaling
        # vLLM with Ray) and has no GPUs. In this case we use the default
        # values for non-H100/H200 GPUs.
        try:
            device_memory = current_platform.get_device_total_memory()
            device_name = current_platform.get_device_name().lower()
        except Exception:
            # This is only used to set default_max_num_batched_tokens
            device_memory = 0

        # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
        # throughput, see PR #17885 for more details.
        # So here we do an extra device name check to prevent such regression.
        from vllm.usage.usage_lib import UsageContext
        if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
            # For GPUs like H100 and MI300x, use larger default values.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 16384,
                UsageContext.OPENAI_API_SERVER: 8192,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 1024,
                UsageContext.OPENAI_API_SERVER: 1024,
            }
        else:
            # TODO(woosuk): Tune the default values for other hardware.
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 8192,
                UsageContext.OPENAI_API_SERVER: 2048,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256,
                UsageContext.OPENAI_API_SERVER: 256,
            }

        # tpu specific default values.
        if current_platform.is_tpu():
            default_max_num_batched_tokens_tpu = {
                UsageContext.LLM_CLASS: {
                    'V6E': 2048,
                    'V5E': 1024,
                    'V5P': 512,
                },
                UsageContext.OPENAI_API_SERVER: {
                    'V6E': 1024,
                    'V5E': 512,
                    'V5P': 256,
                }
            }

        # cpu specific default values.
        if current_platform.is_cpu():
            world_size = self.pipeline_parallel_size * self.tensor_parallel_size
            default_max_num_batched_tokens = {
                UsageContext.LLM_CLASS: 4096 * world_size,
                UsageContext.OPENAI_API_SERVER: 2048 * world_size,
            }
            default_max_num_seqs = {
                UsageContext.LLM_CLASS: 256 * world_size,
                UsageContext.OPENAI_API_SERVER: 128 * world_size,
            }

        use_context_value = usage_context.value if usage_context else None
        if (self.max_num_batched_tokens is None
                and usage_context in default_max_num_batched_tokens):
            if current_platform.is_tpu():
                chip_name = current_platform.get_device_name()
                if chip_name in default_max_num_batched_tokens_tpu[
                        usage_context]:
                    self.max_num_batched_tokens = \
                        default_max_num_batched_tokens_tpu[
                            usage_context][chip_name]
                else:
                    self.max_num_batched_tokens = \
                        default_max_num_batched_tokens[usage_context]
            else:
                if not self.enable_chunked_prefill:
                    self.max_num_batched_tokens = model_config.max_model_len
                else:
                    self.max_num_batched_tokens = \
                        default_max_num_batched_tokens[usage_context]
            logger.debug(
                "Setting max_num_batched_tokens to %d for %s usage context.",
                self.max_num_batched_tokens, use_context_value)

        if (self.max_num_seqs is None
                and usage_context in default_max_num_seqs):
            self.max_num_seqs = min(default_max_num_seqs[usage_context],
                                    self.max_num_batched_tokens or sys.maxsize)

            logger.debug("Setting max_num_seqs to %d for %s usage context.",
                         self.max_num_seqs, use_context_value)
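
A minimal usage sketch (illustrative, not part of the source above). It assumes vllm.utils.FlexibleArgumentParser, which accepts the deprecated= keyword used by some of the flags registered here; the EngineArgs fields mirror those CLI flags:

from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

# Register all of the grouped CLI flags shown above on a parser.
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args(["--model", "facebook/opt-125m", "--max-num-seqs", "64"])

# Round-trip the parsed namespace into EngineArgs, then build the VllmConfig.
engine_args = EngineArgs.from_cli_args(args)
vllm_config = engine_args.create_engine_config()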

additional_config class-attribute instance-attribute

additional_config: dict[str, Any] = get_field(
    VllmConfig, "additional_config"
)

allowed_local_media_path class-attribute instance-attribute

allowed_local_media_path: str = allowed_local_media_path

async_scheduling class-attribute instance-attribute

async_scheduling: bool = async_scheduling

block_size class-attribute instance-attribute

block_size: Optional[BlockSize] = block_size

calculate_kv_scales class-attribute instance-attribute

calculate_kv_scales: bool = calculate_kv_scales

code_revision class-attribute instance-attribute

code_revision: Optional[str] = code_revision

collect_detailed_traces class-attribute instance-attribute

compilation_config class-attribute instance-attribute

compilation_config: CompilationConfig = get_field(
    VllmConfig, "compilation_config"
)

config_format class-attribute instance-attribute

config_format: str = config_format

convert class-attribute instance-attribute

cpu_offload_gb class-attribute instance-attribute

cpu_offload_gb: float = cpu_offload_gb

cuda_graph_sizes class-attribute instance-attribute

cuda_graph_sizes: list[int] = get_field(
    SchedulerConfig, "cuda_graph_sizes"
)

data_parallel_address class-attribute instance-attribute

data_parallel_address: Optional[str] = None

data_parallel_backend class-attribute instance-attribute

data_parallel_backend: str = data_parallel_backend

data_parallel_hybrid_lb class-attribute instance-attribute

data_parallel_hybrid_lb: bool = False

data_parallel_rank class-attribute instance-attribute

data_parallel_rank: Optional[int] = None

data_parallel_rpc_port class-attribute instance-attribute

data_parallel_rpc_port: Optional[int] = None

data_parallel_size class-attribute instance-attribute

data_parallel_size: int = data_parallel_size

data_parallel_size_local class-attribute instance-attribute

data_parallel_size_local: Optional[int] = None

data_parallel_start_rank class-attribute instance-attribute

data_parallel_start_rank: Optional[int] = None

default_mm_loras class-attribute instance-attribute

default_mm_loras: Optional[Dict[str, str]] = (
    default_mm_loras
)

disable_async_output_proc class-attribute instance-attribute

disable_async_output_proc: bool = not use_async_output_proc

disable_cascade_attn class-attribute instance-attribute

disable_cascade_attn: bool = disable_cascade_attn

disable_chunked_mm_input class-attribute instance-attribute

disable_chunked_mm_input: bool = disable_chunked_mm_input

disable_custom_all_reduce class-attribute instance-attribute

disable_custom_all_reduce: bool = disable_custom_all_reduce

disable_hybrid_kv_cache_manager class-attribute instance-attribute

disable_hybrid_kv_cache_manager: bool = (
    disable_hybrid_kv_cache_manager
)

disable_log_stats class-attribute instance-attribute

disable_log_stats: bool = False

disable_mm_preprocessor_cache class-attribute instance-attribute

disable_mm_preprocessor_cache: bool = False

disable_sliding_window class-attribute instance-attribute

disable_sliding_window: bool = disable_sliding_window

distributed_executor_backend class-attribute instance-attribute

distributed_executor_backend: Optional[
    Union[
        str,
        DistributedExecutorBackend,
        Type[ExecutorBase],
    ]
] = distributed_executor_backend

download_dir class-attribute instance-attribute

download_dir: Optional[str] = download_dir

dtype class-attribute instance-attribute

dtype: ModelDType = dtype

enable_chunked_prefill class-attribute instance-attribute

enable_chunked_prefill: Optional[bool] = (
    enable_chunked_prefill
)

enable_eplb class-attribute instance-attribute

enable_eplb: bool = enable_eplb

enable_expert_parallel class-attribute instance-attribute

enable_expert_parallel: bool = enable_expert_parallel

enable_lora class-attribute instance-attribute

enable_lora: bool = False

enable_lora_bias class-attribute instance-attribute

enable_lora_bias: bool = bias_enabled

enable_multimodal_encoder_data_parallel class-attribute instance-attribute

enable_multimodal_encoder_data_parallel: bool = False

enable_prefix_caching class-attribute instance-attribute

enable_prefix_caching: Optional[bool] = (
    enable_prefix_caching
)

enable_prompt_embeds class-attribute instance-attribute

enable_prompt_embeds: bool = enable_prompt_embeds

enable_sleep_mode class-attribute instance-attribute

enable_sleep_mode: bool = enable_sleep_mode

enforce_eager class-attribute instance-attribute

enforce_eager: bool = enforce_eager

eplb_config class-attribute instance-attribute

eplb_config: EPLBConfig = get_field(
    ParallelConfig, "eplb_config"
)

eplb_log_balancedness class-attribute instance-attribute

eplb_log_balancedness: bool = log_balancedness

eplb_step_interval class-attribute instance-attribute

eplb_step_interval: int = step_interval

eplb_window_size class-attribute instance-attribute

eplb_window_size: int = window_size

fully_sharded_loras class-attribute instance-attribute

fully_sharded_loras: bool = fully_sharded_loras

generation_config class-attribute instance-attribute

generation_config: str = generation_config

gpu_memory_utilization class-attribute instance-attribute

gpu_memory_utilization: float = gpu_memory_utilization

guided_decoding_backend class-attribute instance-attribute

guided_decoding_backend: GuidedDecodingBackend = backend

guided_decoding_disable_additional_properties class-attribute instance-attribute

guided_decoding_disable_additional_properties: bool = (
    disable_additional_properties
)

guided_decoding_disable_any_whitespace class-attribute instance-attribute

guided_decoding_disable_any_whitespace: bool = (
    disable_any_whitespace
)

guided_decoding_disable_fallback class-attribute instance-attribute

guided_decoding_disable_fallback: bool = disable_fallback

hf_config_path class-attribute instance-attribute

hf_config_path: Optional[str] = hf_config_path

hf_overrides class-attribute instance-attribute

hf_overrides: HfOverrides = get_field(
    ModelConfig, "hf_overrides"
)

hf_token class-attribute instance-attribute

hf_token: Optional[Union[bool, str]] = hf_token

ignore_patterns class-attribute instance-attribute

ignore_patterns: Optional[Union[str, List[str]]] = (
    ignore_patterns
)

interleave_mm_strings class-attribute instance-attribute

interleave_mm_strings: bool = interleave_mm_strings

kv_cache_dtype class-attribute instance-attribute

kv_cache_dtype: CacheDType = cache_dtype

kv_events_config class-attribute instance-attribute

kv_events_config: Optional[KVEventsConfig] = None

kv_sharing_fast_prefill class-attribute instance-attribute

kv_sharing_fast_prefill: bool = kv_sharing_fast_prefill

kv_transfer_config class-attribute instance-attribute

kv_transfer_config: Optional[KVTransferConfig] = None

limit_mm_per_prompt class-attribute instance-attribute

limit_mm_per_prompt: dict[str, int] = get_field(
    MultiModalConfig, "limit_per_prompt"
)

load_format class-attribute instance-attribute

load_format: Union[str, LoadFormats] = load_format

logits_processor_pattern class-attribute instance-attribute

logits_processor_pattern: Optional[str] = (
    logits_processor_pattern
)

logits_processors class-attribute instance-attribute

logits_processors: Optional[
    list[Union[str, type[LogitsProcessor]]]
] = logits_processors

Custom logitproc types

logprobs_mode class-attribute instance-attribute

logprobs_mode: LogprobsMode = logprobs_mode

long_prefill_token_threshold class-attribute instance-attribute

long_prefill_token_threshold: int = (
    long_prefill_token_threshold
)

lora_dtype class-attribute instance-attribute

lora_dtype: Optional[Union[str, dtype]] = lora_dtype

lora_extra_vocab_size class-attribute instance-attribute

lora_extra_vocab_size: int = lora_extra_vocab_size

mamba_cache_dtype class-attribute instance-attribute

mamba_cache_dtype: MambaDType = mamba_cache_dtype

mamba_ssm_cache_dtype class-attribute instance-attribute

mamba_ssm_cache_dtype: MambaDType = mamba_ssm_cache_dtype

max_cpu_loras class-attribute instance-attribute

max_cpu_loras: Optional[int] = max_cpu_loras

max_logprobs class-attribute instance-attribute

max_logprobs: int = max_logprobs

max_long_partial_prefills class-attribute instance-attribute

max_long_partial_prefills: int = max_long_partial_prefills

max_lora_rank class-attribute instance-attribute

max_lora_rank: int = max_lora_rank

max_loras class-attribute instance-attribute

max_loras: int = max_loras

max_model_len class-attribute instance-attribute

max_model_len: Optional[int] = max_model_len

max_num_batched_tokens class-attribute instance-attribute

max_num_batched_tokens: Optional[int] = (
    max_num_batched_tokens
)

max_num_partial_prefills class-attribute instance-attribute

max_num_partial_prefills: int = max_num_partial_prefills

max_num_seqs class-attribute instance-attribute

max_num_seqs: Optional[int] = max_num_seqs

max_parallel_loading_workers class-attribute instance-attribute

max_parallel_loading_workers: Optional[int] = (
    max_parallel_loading_workers
)

max_seq_len_to_capture class-attribute instance-attribute

max_seq_len_to_capture: int = max_seq_len_to_capture

media_io_kwargs class-attribute instance-attribute

media_io_kwargs: dict[str, dict[str, Any]] = get_field(
    MultiModalConfig, "media_io_kwargs"
)

mm_encoder_tp_mode class-attribute instance-attribute

mm_encoder_tp_mode: MMEncoderTPMode = mm_encoder_tp_mode

mm_processor_cache_gb class-attribute instance-attribute

mm_processor_cache_gb: int = mm_processor_cache_gb

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: Optional[Dict[str, Any]] = (
    mm_processor_kwargs
)

model class-attribute instance-attribute

model: str = model

model_impl class-attribute instance-attribute

model_impl: str = model_impl

model_loader_extra_config class-attribute instance-attribute

model_loader_extra_config: dict = get_field(
    LoadConfig, "model_loader_extra_config"
)

num_gpu_blocks_override class-attribute instance-attribute

num_gpu_blocks_override: Optional[int] = (
    num_gpu_blocks_override
)

num_lookahead_slots class-attribute instance-attribute

num_lookahead_slots: int = num_lookahead_slots

num_redundant_experts class-attribute instance-attribute

num_redundant_experts: int = num_redundant_experts

otlp_traces_endpoint class-attribute instance-attribute

otlp_traces_endpoint: Optional[str] = otlp_traces_endpoint

override_attention_dtype class-attribute instance-attribute

override_attention_dtype: str = override_attention_dtype

override_generation_config class-attribute instance-attribute

override_generation_config: dict[str, Any] = get_field(
    ModelConfig, "override_generation_config"
)

override_neuron_config class-attribute instance-attribute

override_neuron_config: dict[str, Any] = get_field(
    ModelConfig, "override_neuron_config"
)

override_pooler_config class-attribute instance-attribute

override_pooler_config: Optional[
    Union[dict, PoolerConfig]
] = override_pooler_config

pipeline_parallel_size class-attribute instance-attribute

pipeline_parallel_size: int = pipeline_parallel_size

preemption_mode class-attribute instance-attribute

preemption_mode: Optional[str] = preemption_mode

prefix_caching_hash_algo class-attribute instance-attribute

prefix_caching_hash_algo: PrefixCachingHashAlgo = (
    prefix_caching_hash_algo
)

pt_load_map_location class-attribute instance-attribute

pt_load_map_location: str = pt_load_map_location

quantization class-attribute instance-attribute

quantization: Optional[QuantizationMethods] = quantization

ray_workers_use_nsight class-attribute instance-attribute

ray_workers_use_nsight: bool = ray_workers_use_nsight

reasoning_parser class-attribute instance-attribute

reasoning_parser: str = reasoning_backend

revision class-attribute instance-attribute

revision: Optional[str] = revision

rope_scaling class-attribute instance-attribute

rope_scaling: dict[str, Any] = get_field(
    ModelConfig, "rope_scaling"
)

rope_theta class-attribute instance-attribute

rope_theta: Optional[float] = rope_theta

runner class-attribute instance-attribute

runner: RunnerOption = runner

scheduler_cls class-attribute instance-attribute

scheduler_cls: Union[str, Type[object]] = scheduler_cls

scheduler_delay_factor class-attribute instance-attribute

scheduler_delay_factor: float = delay_factor

scheduling_policy class-attribute instance-attribute

scheduling_policy: SchedulerPolicy = policy

seed class-attribute instance-attribute

seed: Optional[int] = seed

served_model_name class-attribute instance-attribute

served_model_name: Optional[Union[str, List[str]]] = (
    served_model_name
)

show_hidden_metrics_for_version class-attribute instance-attribute

show_hidden_metrics_for_version: Optional[str] = (
    show_hidden_metrics_for_version
)

skip_mm_profiling class-attribute instance-attribute

skip_mm_profiling: bool = skip_mm_profiling

skip_tokenizer_init class-attribute instance-attribute

skip_tokenizer_init: bool = skip_tokenizer_init

speculative_config class-attribute instance-attribute

speculative_config: Optional[Dict[str, Any]] = None

swap_space class-attribute instance-attribute

swap_space: float = swap_space

task class-attribute instance-attribute

task: Optional[TaskOption] = task

tensor_parallel_size class-attribute instance-attribute

tensor_parallel_size: int = tensor_parallel_size

tokenizer class-attribute instance-attribute

tokenizer: Optional[str] = tokenizer

tokenizer_mode class-attribute instance-attribute

tokenizer_mode: TokenizerMode = tokenizer_mode

tokenizer_revision class-attribute instance-attribute

tokenizer_revision: Optional[str] = tokenizer_revision

trust_remote_code class-attribute instance-attribute

trust_remote_code: bool = trust_remote_code

use_tqdm_on_load class-attribute instance-attribute

use_tqdm_on_load: bool = use_tqdm_on_load

worker_cls class-attribute instance-attribute

worker_cls: str = worker_cls

worker_extension_cls class-attribute instance-attribute

worker_extension_cls: str = worker_extension_cls

__init__

__init__(
    model: str = model,
    served_model_name: Optional[
        Union[str, List[str]]
    ] = served_model_name,
    tokenizer: Optional[str] = tokenizer,
    hf_config_path: Optional[str] = hf_config_path,
    runner: RunnerOption = runner,
    convert: ConvertOption = convert,
    task: Optional[TaskOption] = task,
    skip_tokenizer_init: bool = skip_tokenizer_init,
    enable_prompt_embeds: bool = enable_prompt_embeds,
    tokenizer_mode: TokenizerMode = tokenizer_mode,
    trust_remote_code: bool = trust_remote_code,
    allowed_local_media_path: str = allowed_local_media_path,
    download_dir: Optional[str] = download_dir,
    load_format: Union[str, LoadFormats] = load_format,
    config_format: str = config_format,
    dtype: ModelDType = dtype,
    kv_cache_dtype: CacheDType = cache_dtype,
    seed: Optional[int] = seed,
    max_model_len: Optional[int] = max_model_len,
    cuda_graph_sizes: list[int] = get_field(
        SchedulerConfig, "cuda_graph_sizes"
    ),
    distributed_executor_backend: Optional[
        Union[
            str,
            DistributedExecutorBackend,
            Type[ExecutorBase],
        ]
    ] = distributed_executor_backend,
    pipeline_parallel_size: int = pipeline_parallel_size,
    tensor_parallel_size: int = tensor_parallel_size,
    data_parallel_size: int = data_parallel_size,
    data_parallel_rank: Optional[int] = None,
    data_parallel_start_rank: Optional[int] = None,
    data_parallel_size_local: Optional[int] = None,
    data_parallel_address: Optional[str] = None,
    data_parallel_rpc_port: Optional[int] = None,
    data_parallel_hybrid_lb: bool = False,
    data_parallel_backend: str = data_parallel_backend,
    enable_expert_parallel: bool = enable_expert_parallel,
    eplb_config: EPLBConfig = get_field(
        ParallelConfig, "eplb_config"
    ),
    enable_eplb: bool = enable_eplb,
    num_redundant_experts: int = num_redundant_experts,
    eplb_window_size: int = window_size,
    eplb_step_interval: int = step_interval,
    eplb_log_balancedness: bool = log_balancedness,
    max_parallel_loading_workers: Optional[
        int
    ] = max_parallel_loading_workers,
    block_size: Optional[BlockSize] = block_size,
    enable_prefix_caching: Optional[
        bool
    ] = enable_prefix_caching,
    prefix_caching_hash_algo: PrefixCachingHashAlgo = prefix_caching_hash_algo,
    disable_sliding_window: bool = disable_sliding_window,
    disable_cascade_attn: bool = disable_cascade_attn,
    swap_space: float = swap_space,
    cpu_offload_gb: float = cpu_offload_gb,
    gpu_memory_utilization: float = gpu_memory_utilization,
    max_num_batched_tokens: Optional[
        int
    ] = max_num_batched_tokens,
    max_num_partial_prefills: int = max_num_partial_prefills,
    max_long_partial_prefills: int = max_long_partial_prefills,
    long_prefill_token_threshold: int = long_prefill_token_threshold,
    max_num_seqs: Optional[int] = max_num_seqs,
    max_logprobs: int = max_logprobs,
    logprobs_mode: LogprobsMode = logprobs_mode,
    disable_log_stats: bool = False,
    revision: Optional[str] = revision,
    code_revision: Optional[str] = code_revision,
    rope_scaling: dict[str, Any] = get_field(
        ModelConfig, "rope_scaling"
    ),
    rope_theta: Optional[float] = rope_theta,
    hf_token: Optional[Union[bool, str]] = hf_token,
    hf_overrides: HfOverrides = get_field(
        ModelConfig, "hf_overrides"
    ),
    tokenizer_revision: Optional[str] = tokenizer_revision,
    quantization: Optional[
        QuantizationMethods
    ] = quantization,
    enforce_eager: bool = enforce_eager,
    max_seq_len_to_capture: int = max_seq_len_to_capture,
    disable_custom_all_reduce: bool = disable_custom_all_reduce,
    limit_mm_per_prompt: dict[str, int] = get_field(
        MultiModalConfig, "limit_per_prompt"
    ),
    interleave_mm_strings: bool = interleave_mm_strings,
    media_io_kwargs: dict[str, dict[str, Any]] = get_field(
        MultiModalConfig, "media_io_kwargs"
    ),
    mm_processor_kwargs: Optional[
        Dict[str, Any]
    ] = mm_processor_kwargs,
    disable_mm_preprocessor_cache: bool = False,
    mm_processor_cache_gb: int = mm_processor_cache_gb,
    mm_encoder_tp_mode: MMEncoderTPMode = mm_encoder_tp_mode,
    skip_mm_profiling: bool = skip_mm_profiling,
    enable_lora: bool = False,
    enable_lora_bias: bool = bias_enabled,
    max_loras: int = max_loras,
    max_lora_rank: int = max_lora_rank,
    default_mm_loras: Optional[
        Dict[str, str]
    ] = default_mm_loras,
    fully_sharded_loras: bool = fully_sharded_loras,
    max_cpu_loras: Optional[int] = max_cpu_loras,
    lora_dtype: Optional[Union[str, dtype]] = lora_dtype,
    lora_extra_vocab_size: int = lora_extra_vocab_size,
    ray_workers_use_nsight: bool = ray_workers_use_nsight,
    num_gpu_blocks_override: Optional[
        int
    ] = num_gpu_blocks_override,
    num_lookahead_slots: int = num_lookahead_slots,
    model_loader_extra_config: dict = get_field(
        LoadConfig, "model_loader_extra_config"
    ),
    ignore_patterns: Optional[
        Union[str, List[str]]
    ] = ignore_patterns,
    preemption_mode: Optional[str] = preemption_mode,
    scheduler_delay_factor: float = delay_factor,
    enable_chunked_prefill: Optional[
        bool
    ] = enable_chunked_prefill,
    disable_chunked_mm_input: bool = disable_chunked_mm_input,
    disable_hybrid_kv_cache_manager: bool = disable_hybrid_kv_cache_manager,
    guided_decoding_backend: GuidedDecodingBackend = backend,
    guided_decoding_disable_fallback: bool = disable_fallback,
    guided_decoding_disable_any_whitespace: bool = disable_any_whitespace,
    guided_decoding_disable_additional_properties: bool = disable_additional_properties,
    logits_processor_pattern: Optional[
        str
    ] = logits_processor_pattern,
    speculative_config: Optional[Dict[str, Any]] = None,
    show_hidden_metrics_for_version: Optional[
        str
    ] = show_hidden_metrics_for_version,
    otlp_traces_endpoint: Optional[
        str
    ] = otlp_traces_endpoint,
    collect_detailed_traces: Optional[
        list[DetailedTraceModules]
    ] = collect_detailed_traces,
    disable_async_output_proc: bool = not use_async_output_proc,
    scheduling_policy: SchedulerPolicy = policy,
    scheduler_cls: Union[str, Type[object]] = scheduler_cls,
    override_neuron_config: dict[str, Any] = get_field(
        ModelConfig, "override_neuron_config"
    ),
    override_pooler_config: Optional[
        Union[dict, PoolerConfig]
    ] = override_pooler_config,
    compilation_config: CompilationConfig = get_field(
        VllmConfig, "compilation_config"
    ),
    worker_cls: str = worker_cls,
    worker_extension_cls: str = worker_extension_cls,
    kv_transfer_config: Optional[KVTransferConfig] = None,
    kv_events_config: Optional[KVEventsConfig] = None,
    generation_config: str = generation_config,
    enable_sleep_mode: bool = enable_sleep_mode,
    override_generation_config: dict[str, Any] = get_field(
        ModelConfig, "override_generation_config"
    ),
    model_impl: str = model_impl,
    override_attention_dtype: str = override_attention_dtype,
    calculate_kv_scales: bool = calculate_kv_scales,
    mamba_cache_dtype: MambaDType = mamba_cache_dtype,
    mamba_ssm_cache_dtype: MambaDType = mamba_ssm_cache_dtype,
    additional_config: dict[str, Any] = get_field(
        VllmConfig, "additional_config"
    ),
    reasoning_parser: str = reasoning_backend,
    use_tqdm_on_load: bool = use_tqdm_on_load,
    pt_load_map_location: str = pt_load_map_location,
    enable_multimodal_encoder_data_parallel: bool = False,
    logits_processors: Optional[
        list[Union[str, type[LogitsProcessor]]]
    ] = logits_processors,
    async_scheduling: bool = async_scheduling,
    kv_sharing_fast_prefill: bool = kv_sharing_fast_prefill,
) -> None
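
The signature above mirrors the dataclass fields one-to-one, so engine arguments are normally constructed with plain keyword arguments. A minimal, hedged sketch (the model id and values below are illustrative, not recommendations):

from vllm import EngineArgs

# Every keyword corresponds to a field documented above.
engine_args = EngineArgs(
    model="facebook/opt-125m",       # illustrative model id
    tensor_parallel_size=1,
    max_model_len=2048,
    gpu_memory_utilization=0.90,
)
print(engine_args.max_model_len)     # 2048

Since AsyncEngineArgs subclasses EngineArgs, the same keywords apply when constructing arguments for the asynchronous engine.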

__post_init__

__post_init__()
Source code in vllm/engine/arg_utils.py
def __post_init__(self):
    # support `EngineArgs(compilation_config={...})`
    # without having to manually construct a
    # CompilationConfig object
    if isinstance(self.compilation_config, dict):
        self.compilation_config = CompilationConfig(
            **self.compilation_config)
    if isinstance(self.eplb_config, dict):
        self.eplb_config = EPLBConfig(**self.eplb_config)
    # Setup plugins
    from vllm.plugins import load_general_plugins
    load_general_plugins()
    # When running with HF offline mode, replace the model id with the
    # local model path.
    if huggingface_hub.constants.HF_HUB_OFFLINE:
        model_id = self.model
        self.model = get_model_path(self.model, self.revision)
        logger.info(
            "HF_HUB_OFFLINE is True, replacing model_id [%s] "
            "with model_path [%s]", model_id, self.model)

_is_v1_supported_oracle

_is_v1_supported_oracle(model_config: ModelConfig) -> bool

Oracle for whether to use V0 or V1 Engine by default.

Source code in vllm/engine/arg_utils.py
def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
    """Oracle for whether to use V0 or V1 Engine by default."""

    #############################################################
    # Unsupported Feature Flags on V1.

    if self.load_format == "sharded_state":
        _raise_or_fallback(
            feature_name=f"--load_format {self.load_format}",
            recommend_to_remove=False)
        return False

    if (self.logits_processor_pattern
            != EngineArgs.logits_processor_pattern):
        _raise_or_fallback(feature_name="--logits-processor-pattern",
                           recommend_to_remove=False)
        return False

    if self.preemption_mode != SchedulerConfig.preemption_mode:
        _raise_or_fallback(feature_name="--preemption-mode",
                           recommend_to_remove=True)
        return False

    if (self.disable_async_output_proc
            != EngineArgs.disable_async_output_proc):
        _raise_or_fallback(feature_name="--disable-async-output-proc",
                           recommend_to_remove=True)
        return False

    if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
        _raise_or_fallback(feature_name="--scheduler-delay-factor",
                           recommend_to_remove=True)
        return False

    # Triton v3.3 has f16 conversion regression issue on Turing and Volta,
    # which broke fp16 inference
    # see: https://github.com/triton-lang/triton/issues/6698
    if (current_platform.is_cuda()
            and not current_platform.has_device_capability(80)
            and model_config.dtype == torch.float16):
        _raise_or_fallback(
            feature_name="Compute Capability < 8.0 with FP16",
            recommend_to_remove=False)
        return False

    if self.kv_cache_dtype != "auto":
        supported = current_platform.is_kv_cache_dtype_supported(
            self.kv_cache_dtype, model_config)
        if not supported:
            _raise_or_fallback(feature_name="--kv-cache-dtype",
                               recommend_to_remove=False)
            return False

    # No text embedding inputs so far.
    if self.enable_prompt_embeds:
        _raise_or_fallback(feature_name="--enable-prompt-embeds",
                           recommend_to_remove=False)
        return False

    # No Mamba or Encoder-Decoder so far.
    if not model_config.is_v1_compatible:
        _raise_or_fallback(feature_name=model_config.architectures,
                           recommend_to_remove=False)
        return False

    # V1 mamba models are unoptimized.
    if model_config.has_inner_state and _warn_or_fallback(
            feature_name="Mamba"):
        return False

    # No Concurrent Partial Prefills so far.
    if (self.max_num_partial_prefills
            != SchedulerConfig.max_num_partial_prefills
            or self.max_long_partial_prefills
            != SchedulerConfig.max_long_partial_prefills):
        _raise_or_fallback(feature_name="Concurrent Partial Prefill",
                           recommend_to_remove=False)
        return False

    # No OTLP observability so far.
    if (self.otlp_traces_endpoint or self.collect_detailed_traces):
        _raise_or_fallback(feature_name="--otlp-traces-endpoint",
                           recommend_to_remove=False)
        return False

    # V1 supports N-gram, Medusa, and Eagle speculative decoding.
    if (self.speculative_config is not None
            and self.speculative_config.get("method") == "draft_model"):
        raise NotImplementedError(
            "Speculative decoding with draft model is not supported yet. "
            "Please consider using other speculative decoding methods "
            "such as ngram, medusa, eagle, or deepseek_mtp.")

    V1_BACKENDS = [
        "FLASH_ATTN_VLLM_V1",
        "FLASH_ATTN",
        "PALLAS",
        "PALLAS_VLLM_V1",
        "TRITON_ATTN_VLLM_V1",
        "TRITON_MLA",
        "CUTLASS_MLA",
        "FLASHMLA",
        "FLASHINFER",
        "FLASHINFER_VLLM_V1",
        "ROCM_AITER_MLA",
        "TORCH_SDPA_VLLM_V1",
        "FLEX_ATTENTION",
        "TREE_ATTN",
        "XFORMERS_VLLM_V1",
    ]
    if (envs.is_set("VLLM_ATTENTION_BACKEND")
            and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
        name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}"
        _raise_or_fallback(feature_name=name, recommend_to_remove=True)
        return False

    # Platforms must decide if they can support v1 for this model
    if not current_platform.supports_v1(model_config=model_config):
        _raise_or_fallback(
            feature_name=f"device type={current_platform.device_type}",
            recommend_to_remove=False)
        return False
    #############################################################
    # Experimental Features - allow users to opt in.

    if self.pipeline_parallel_size > 1:
        supports_pp = getattr(self.distributed_executor_backend,
                              'supports_pp', False)
        if not supports_pp and self.distributed_executor_backend not in (
                ParallelConfig.distributed_executor_backend, "ray", "mp",
                "external_launcher"):
            name = "Pipeline Parallelism without Ray distributed " \
                    "executor or multiprocessing executor or external " \
                    "launcher"
            _raise_or_fallback(feature_name=name,
                               recommend_to_remove=False)
            return False

    # The platform may be supported on V1, but off by default for now.
    if not current_platform.default_v1(  # noqa: SIM103
            model_config=model_config) and _warn_or_fallback(
                current_platform.device_name):
        return False

    if (current_platform.is_cpu()
            and model_config.get_sliding_window() is not None):
        _raise_or_fallback(feature_name="sliding window (CPU backend)",
                           recommend_to_remove=False)
        return False

    #############################################################

    return True
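
The _raise_or_fallback helper is not shown here, but the overall contract follows from the docstring of create_engine_config below: if the user explicitly pinned VLLM_USE_V1=1, an incompatible feature raises; otherwise the oracle simply returns False and the engine falls back to V0. A self-contained sketch of that pattern (names are illustrative and not vLLM internals):

import os

def raise_or_fallback(feature_name: str) -> bool:
    """Raise if the user pinned V1 explicitly; otherwise signal a V0 fallback."""
    if os.environ.get("VLLM_USE_V1") == "1":
        raise ValueError(f"{feature_name} is not supported on the V1 engine.")
    return False

# Inside an oracle-like check this would be used as:
#     if unsupported_feature_enabled:
#         return raise_or_fallback("--some-flag")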

_set_default_args_v0

_set_default_args_v0(model_config: ModelConfig) -> None

Set Default Arguments for V0 Engine.

Source code in vllm/engine/arg_utils.py
def _set_default_args_v0(self, model_config: ModelConfig) -> None:
    """Set Default Arguments for V0 Engine."""

    max_model_len = model_config.max_model_len
    use_long_context = max_model_len > 32768
    if self.enable_chunked_prefill is None:
        # Chunked prefill not supported for Multimodal or MLA in V0.
        if model_config.is_multimodal_model or model_config.use_mla:
            self.enable_chunked_prefill = False

        # Enable chunked prefill by default for long context (> 32K)
        # models to avoid OOM errors in initial memory profiling phase.
        elif use_long_context:
            is_gpu = current_platform.is_cuda()
            use_sliding_window = (model_config.get_sliding_window()
                                  is not None)
            use_spec_decode = self.speculative_config is not None

            if (is_gpu and not use_sliding_window and not use_spec_decode
                    and not self.enable_lora
                    and model_config.runner_type != "pooling"):
                self.enable_chunked_prefill = True
                logger.warning(
                    "Chunked prefill is enabled by default for models "
                    "with max_model_len > 32K. Chunked prefill might "
                    "not work with some features or models. If you "
                    "encounter any issues, please disable by launching "
                    "with --enable-chunked-prefill=False.")

        if self.enable_chunked_prefill is None:
            self.enable_chunked_prefill = False

    if not self.enable_chunked_prefill and use_long_context:
        logger.warning(
            "The model has a long context length (%s). This may cause"
            "OOM during the initial memory profiling phase, or result "
            "in low performance due to small KV cache size. Consider "
            "setting --max-model-len to a smaller value.", max_model_len)
    elif (self.enable_chunked_prefill
          and model_config.runner_type == "pooling"):
        msg = "Chunked prefill is not supported for pooling models"
        raise ValueError(msg)

    # if using prefix caching, we must set a hash algo
    if self.enable_prefix_caching:
        # Disable prefix caching for multimodal models for VLLM_V0.
        if model_config.is_multimodal_model:
            logger.warning(
                "--enable-prefix-caching is not supported for multimodal "
                "models in V0 and has been disabled.")
            self.enable_prefix_caching = False

        # VLLM_V0 only supports builtin hash algo for prefix caching.
        if self.prefix_caching_hash_algo == "sha256":
            raise ValueError(
                "sha256 is not supported for prefix caching in V0 engine. "
                "Please use 'builtin'.")

    # Set max_num_seqs to 256 for VLLM_V0.
    if self.max_num_seqs is None:
        self.max_num_seqs = 256
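
Each default above is applied only when the corresponding argument is still None, so explicitly passed values bypass this logic entirely. A hedged illustration:

from vllm import EngineArgs

# Explicit values are left untouched by _set_default_args_v0.
engine_args = EngineArgs(
    model="facebook/opt-125m",
    enable_chunked_prefill=False,  # skips the >32K auto-enable heuristic
    max_num_seqs=64,               # skips the V0 default of 256
)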

_set_default_args_v1

_set_default_args_v1(
    usage_context: UsageContext, model_config: ModelConfig
) -> None

Set Default Arguments for V1 Engine.

Source code in vllm/engine/arg_utils.py
def _set_default_args_v1(self, usage_context: UsageContext,
                         model_config: ModelConfig) -> None:
    """Set Default Arguments for V1 Engine."""

    # V1 always uses chunked prefills and prefix caching
    # for non-pooling tasks.
    # For pooling tasks the default is False
    if model_config.runner_type != "pooling":
        self.enable_chunked_prefill = True
        if self.enable_prefix_caching is None:
            self.enable_prefix_caching = True
    else:

        pooling_type = model_config.pooler_config.pooling_type
        is_causal = getattr(model_config.hf_config, "is_causal", True)
        incremental_prefill_supported = (pooling_type is not None
                                         and pooling_type.lower() == "last"
                                         and is_causal)

        action = "Enabling" if \
            incremental_prefill_supported else "Disabling"

        if self.enable_chunked_prefill is None:
            self.enable_chunked_prefill = incremental_prefill_supported
            logger.info("(%s) chunked prefill by default", action)
        if self.enable_prefix_caching is None:
            self.enable_prefix_caching = incremental_prefill_supported
            logger.info("(%s) prefix caching by default", action)

    # V1 should use the new scheduler by default.
    # Swap it only if this arg is set to the original V0 default
    if self.scheduler_cls == EngineArgs.scheduler_cls:
        self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"

    # When no user override, set the default values based on the usage
    # context.
    # Use different default values for different hardware.

    # Try to query the device name on the current platform. If it fails,
    # it may be because the platform that imports vLLM is not the same
    # as the platform that vLLM is running on (e.g. the case of scaling
    # vLLM with Ray) and has no GPUs. In this case we use the default
    # values for non-H100/H200 GPUs.
    try:
        device_memory = current_platform.get_device_total_memory()
        device_name = current_platform.get_device_name().lower()
    except Exception:
        # This is only used to set default_max_num_batched_tokens
        device_memory = 0

    # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
    # throughput, see PR #17885 for more details.
    # So here we do an extra device name check to prevent such regression.
    from vllm.usage.usage_lib import UsageContext
    if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
        # For GPUs like H100 and MI300x, use larger default values.
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 16384,
            UsageContext.OPENAI_API_SERVER: 8192,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 1024,
            UsageContext.OPENAI_API_SERVER: 1024,
        }
    else:
        # TODO(woosuk): Tune the default values for other hardware.
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 8192,
            UsageContext.OPENAI_API_SERVER: 2048,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 256,
            UsageContext.OPENAI_API_SERVER: 256,
        }

    # tpu specific default values.
    if current_platform.is_tpu():
        default_max_num_batched_tokens_tpu = {
            UsageContext.LLM_CLASS: {
                'V6E': 2048,
                'V5E': 1024,
                'V5P': 512,
            },
            UsageContext.OPENAI_API_SERVER: {
                'V6E': 1024,
                'V5E': 512,
                'V5P': 256,
            }
        }

    # cpu specific default values.
    if current_platform.is_cpu():
        world_size = self.pipeline_parallel_size * self.tensor_parallel_size
        default_max_num_batched_tokens = {
            UsageContext.LLM_CLASS: 4096 * world_size,
            UsageContext.OPENAI_API_SERVER: 2048 * world_size,
        }
        default_max_num_seqs = {
            UsageContext.LLM_CLASS: 256 * world_size,
            UsageContext.OPENAI_API_SERVER: 128 * world_size,
        }

    use_context_value = usage_context.value if usage_context else None
    if (self.max_num_batched_tokens is None
            and usage_context in default_max_num_batched_tokens):
        if current_platform.is_tpu():
            chip_name = current_platform.get_device_name()
            if chip_name in default_max_num_batched_tokens_tpu[
                    usage_context]:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens_tpu[
                        usage_context][chip_name]
            else:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens[usage_context]
        else:
            if not self.enable_chunked_prefill:
                self.max_num_batched_tokens = model_config.max_model_len
            else:
                self.max_num_batched_tokens = \
                    default_max_num_batched_tokens[usage_context]
        logger.debug(
            "Setting max_num_batched_tokens to %d for %s usage context.",
            self.max_num_batched_tokens, use_context_value)

    if (self.max_num_seqs is None
            and usage_context in default_max_num_seqs):
        self.max_num_seqs = min(default_max_num_seqs[usage_context],
                                self.max_num_batched_tokens or sys.maxsize)

        logger.debug("Setting max_num_seqs to %d for %s usage context.",
                     self.max_num_seqs, use_context_value)
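
The tail of the method shows the general shape of the selection: pick a per-usage-context default, then clamp max_num_seqs by max_num_batched_tokens. A self-contained sketch of that logic using the non-A100 GPU defaults quoted above (illustrative only; the real method also covers the TPU and CPU tables):

from enum import Enum

class UsageContext(Enum):
    LLM_CLASS = "LLM_CLASS"
    OPENAI_API_SERVER = "OPENAI_API_SERVER"

DEFAULT_MAX_NUM_BATCHED_TOKENS = {
    UsageContext.LLM_CLASS: 8192,
    UsageContext.OPENAI_API_SERVER: 2048,
}
DEFAULT_MAX_NUM_SEQS = {
    UsageContext.LLM_CLASS: 256,
    UsageContext.OPENAI_API_SERVER: 256,
}

def resolve_defaults(usage_context: UsageContext,
                     max_num_batched_tokens: int | None,
                     max_num_seqs: int | None) -> tuple[int, int]:
    # Only fill in values the user did not set explicitly.
    if max_num_batched_tokens is None:
        max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS[usage_context]
    if max_num_seqs is None:
        max_num_seqs = min(DEFAULT_MAX_NUM_SEQS[usage_context],
                           max_num_batched_tokens)
    return max_num_batched_tokens, max_num_seqs

print(resolve_defaults(UsageContext.OPENAI_API_SERVER, None, None))  # (2048, 256)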

add_cli_args staticmethod

add_cli_args(
    parser: FlexibleArgumentParser,
) -> FlexibleArgumentParser

Shared CLI arguments for vLLM engine.

Source code in vllm/engine/arg_utils.py
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Shared CLI arguments for vLLM engine."""

    # Model arguments
    model_kwargs = get_kwargs(ModelConfig)
    model_group = parser.add_argument_group(
        title="ModelConfig",
        description=ModelConfig.__doc__,
    )
    if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]):
        model_group.add_argument("--model", **model_kwargs["model"])
    model_group.add_argument("--runner", **model_kwargs["runner"])
    model_group.add_argument("--convert", **model_kwargs["convert"])
    model_group.add_argument("--task",
                             **model_kwargs["task"],
                             deprecated=True)
    model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
    model_group.add_argument("--tokenizer-mode",
                             **model_kwargs["tokenizer_mode"])
    model_group.add_argument("--trust-remote-code",
                             **model_kwargs["trust_remote_code"])
    model_group.add_argument("--dtype", **model_kwargs["dtype"])
    model_group.add_argument("--seed", **model_kwargs["seed"])
    model_group.add_argument("--hf-config-path",
                             **model_kwargs["hf_config_path"])
    model_group.add_argument("--allowed-local-media-path",
                             **model_kwargs["allowed_local_media_path"])
    model_group.add_argument("--revision", **model_kwargs["revision"])
    model_group.add_argument("--code-revision",
                             **model_kwargs["code_revision"])
    model_group.add_argument("--rope-scaling",
                             **model_kwargs["rope_scaling"])
    model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
    model_group.add_argument("--tokenizer-revision",
                             **model_kwargs["tokenizer_revision"])
    model_group.add_argument("--max-model-len",
                             **model_kwargs["max_model_len"])
    model_group.add_argument("--quantization", "-q",
                             **model_kwargs["quantization"])
    model_group.add_argument("--enforce-eager",
                             **model_kwargs["enforce_eager"])
    model_group.add_argument("--max-seq-len-to-capture",
                             **model_kwargs["max_seq_len_to_capture"])
    model_group.add_argument("--max-logprobs",
                             **model_kwargs["max_logprobs"])
    model_group.add_argument("--logprobs-mode",
                             choices=[f.value for f in LogprobsMode],
                             **model_kwargs["logprobs_mode"])
    model_group.add_argument("--disable-sliding-window",
                             **model_kwargs["disable_sliding_window"])
    model_group.add_argument("--disable-cascade-attn",
                             **model_kwargs["disable_cascade_attn"])
    model_group.add_argument("--skip-tokenizer-init",
                             **model_kwargs["skip_tokenizer_init"])
    model_group.add_argument("--enable-prompt-embeds",
                             **model_kwargs["enable_prompt_embeds"])
    model_group.add_argument("--served-model-name",
                             **model_kwargs["served_model_name"])
    # This one is a special case because it is the
    # opposite of ModelConfig.use_async_output_proc
    model_group.add_argument(
        "--disable-async-output-proc",
        action="store_true",
        default=EngineArgs.disable_async_output_proc,
        help="Disable async output processing. This may result in "
        "lower performance.")
    model_group.add_argument("--config-format",
                             choices=[f.value for f in ConfigFormat],
                             **model_kwargs["config_format"])
    # This one is a special case because it can be either
    # bool or str. TODO: Handle this in get_kwargs
    model_group.add_argument("--hf-token",
                             type=str,
                             nargs="?",
                             const=True,
                             default=model_kwargs["hf_token"]["default"],
                             help=model_kwargs["hf_token"]["help"])
    model_group.add_argument("--hf-overrides",
                             **model_kwargs["hf_overrides"])
    model_group.add_argument("--override-neuron-config",
                             **model_kwargs["override_neuron_config"])
    model_group.add_argument("--override-pooler-config",
                             **model_kwargs["override_pooler_config"])
    model_group.add_argument("--logits-processor-pattern",
                             **model_kwargs["logits_processor_pattern"])
    model_group.add_argument("--generation-config",
                             **model_kwargs["generation_config"])
    model_group.add_argument("--override-generation-config",
                             **model_kwargs["override_generation_config"])
    model_group.add_argument("--enable-sleep-mode",
                             **model_kwargs["enable_sleep_mode"])
    model_group.add_argument("--model-impl",
                             choices=[f.value for f in ModelImpl],
                             **model_kwargs["model_impl"])
    model_group.add_argument("--override-attention-dtype",
                             **model_kwargs["override_attention_dtype"])
    model_group.add_argument("--logits-processors",
                             **model_kwargs["logits_processors"])

    # Model loading arguments
    load_kwargs = get_kwargs(LoadConfig)
    load_group = parser.add_argument_group(
        title="LoadConfig",
        description=LoadConfig.__doc__,
    )
    load_group.add_argument("--load-format", **load_kwargs["load_format"])
    load_group.add_argument("--download-dir",
                            **load_kwargs["download_dir"])
    load_group.add_argument("--model-loader-extra-config",
                            **load_kwargs["model_loader_extra_config"])
    load_group.add_argument("--ignore-patterns",
                            **load_kwargs["ignore_patterns"])
    load_group.add_argument("--use-tqdm-on-load",
                            **load_kwargs["use_tqdm_on_load"])
    load_group.add_argument('--pt-load-map-location',
                            **load_kwargs["pt_load_map_location"])

    # Guided decoding arguments
    guided_decoding_kwargs = get_kwargs(DecodingConfig)
    guided_decoding_group = parser.add_argument_group(
        title="DecodingConfig",
        description=DecodingConfig.__doc__,
    )
    guided_decoding_group.add_argument("--guided-decoding-backend",
                                       **guided_decoding_kwargs["backend"])
    guided_decoding_group.add_argument(
        "--guided-decoding-disable-fallback",
        **guided_decoding_kwargs["disable_fallback"])
    guided_decoding_group.add_argument(
        "--guided-decoding-disable-any-whitespace",
        **guided_decoding_kwargs["disable_any_whitespace"])
    guided_decoding_group.add_argument(
        "--guided-decoding-disable-additional-properties",
        **guided_decoding_kwargs["disable_additional_properties"])
    guided_decoding_group.add_argument(
        "--reasoning-parser",
        # This choice is a special case because it's not static
        choices=list(ReasoningParserManager.reasoning_parsers),
        **guided_decoding_kwargs["reasoning_backend"])

    # Parallel arguments
    parallel_kwargs = get_kwargs(ParallelConfig)
    parallel_group = parser.add_argument_group(
        title="ParallelConfig",
        description=ParallelConfig.__doc__,
    )
    parallel_group.add_argument(
        "--distributed-executor-backend",
        **parallel_kwargs["distributed_executor_backend"])
    parallel_group.add_argument(
        "--pipeline-parallel-size", "-pp",
        **parallel_kwargs["pipeline_parallel_size"])
    parallel_group.add_argument("--tensor-parallel-size", "-tp",
                                **parallel_kwargs["tensor_parallel_size"])
    parallel_group.add_argument("--data-parallel-size", "-dp",
                                **parallel_kwargs["data_parallel_size"])
    parallel_group.add_argument(
        '--data-parallel-rank',
        '-dpn',
        type=int,
        help='Data parallel rank of this instance. '
        'When set, enables external load balancer mode.')
    parallel_group.add_argument('--data-parallel-start-rank',
                                '-dpr',
                                type=int,
                                help='Starting data parallel rank '
                                'for secondary nodes.')
    parallel_group.add_argument('--data-parallel-size-local',
                                '-dpl',
                                type=int,
                                help='Number of data parallel replicas '
                                'to run on this node.')
    parallel_group.add_argument('--data-parallel-address',
                                '-dpa',
                                type=str,
                                help='Address of data parallel cluster '
                                'head-node.')
    parallel_group.add_argument('--data-parallel-rpc-port',
                                '-dpp',
                                type=int,
                                help='Port for data parallel RPC '
                                'communication.')
    parallel_group.add_argument('--data-parallel-backend',
                                '-dpb',
                                type=str,
                                default='mp',
                                help='Backend for data parallel, either '
                                '"mp" or "ray".')
    parallel_group.add_argument(
        "--data-parallel-hybrid-lb",
        **parallel_kwargs["data_parallel_hybrid_lb"])
    parallel_group.add_argument(
        "--enable-expert-parallel",
        **parallel_kwargs["enable_expert_parallel"])
    parallel_group.add_argument("--enable-eplb",
                                **parallel_kwargs["enable_eplb"])
    parallel_group.add_argument("--eplb-config",
                                **parallel_kwargs["eplb_config"])
    parallel_group.add_argument(
        "--num-redundant-experts",
        type=int,
        help=
        "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
        deprecated=True)
    parallel_group.add_argument(
        "--eplb-window-size",
        type=int,
        help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
        deprecated=True)
    parallel_group.add_argument(
        "--eplb-step-interval",
        type=int,
        help=
        "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
        deprecated=True)
    parallel_group.add_argument(
        "--eplb-log-balancedness",
        action=argparse.BooleanOptionalAction,
        help=
        "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
        deprecated=True)

    parallel_group.add_argument(
        "--max-parallel-loading-workers",
        **parallel_kwargs["max_parallel_loading_workers"])
    parallel_group.add_argument(
        "--ray-workers-use-nsight",
        **parallel_kwargs["ray_workers_use_nsight"])
    parallel_group.add_argument(
        "--disable-custom-all-reduce",
        **parallel_kwargs["disable_custom_all_reduce"])
    parallel_group.add_argument("--worker-cls",
                                **parallel_kwargs["worker_cls"])
    parallel_group.add_argument("--worker-extension-cls",
                                **parallel_kwargs["worker_extension_cls"])
    parallel_group.add_argument(
        "--enable-multimodal-encoder-data-parallel",
        action="store_true",
        deprecated=True)

    # KV cache arguments
    cache_kwargs = get_kwargs(CacheConfig)
    cache_group = parser.add_argument_group(
        title="CacheConfig",
        description=CacheConfig.__doc__,
    )
    cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
    cache_group.add_argument("--gpu-memory-utilization",
                             **cache_kwargs["gpu_memory_utilization"])
    cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
    cache_group.add_argument("--kv-cache-dtype",
                             **cache_kwargs["cache_dtype"])
    cache_group.add_argument("--num-gpu-blocks-override",
                             **cache_kwargs["num_gpu_blocks_override"])
    cache_group.add_argument("--enable-prefix-caching",
                             **cache_kwargs["enable_prefix_caching"])
    cache_group.add_argument("--prefix-caching-hash-algo",
                             **cache_kwargs["prefix_caching_hash_algo"])
    cache_group.add_argument("--cpu-offload-gb",
                             **cache_kwargs["cpu_offload_gb"])
    cache_group.add_argument("--calculate-kv-scales",
                             **cache_kwargs["calculate_kv_scales"])
    cache_group.add_argument("--kv-sharing-fast-prefill",
                             **cache_kwargs["kv_sharing_fast_prefill"])
    cache_group.add_argument("--mamba-cache-dtype",
                             **cache_kwargs["mamba_cache_dtype"])
    cache_group.add_argument("--mamba-ssm-cache-dtype",
                             **cache_kwargs["mamba_ssm_cache_dtype"])

    # Multimodal related configs
    multimodal_kwargs = get_kwargs(MultiModalConfig)
    multimodal_group = parser.add_argument_group(
        title="MultiModalConfig",
        description=MultiModalConfig.__doc__,
    )
    multimodal_group.add_argument("--limit-mm-per-prompt",
                                  **multimodal_kwargs["limit_per_prompt"])
    multimodal_group.add_argument("--media-io-kwargs",
                                  **multimodal_kwargs["media_io_kwargs"])
    multimodal_group.add_argument(
        "--mm-processor-kwargs",
        **multimodal_kwargs["mm_processor_kwargs"])
    multimodal_group.add_argument(
        "--mm-processor-cache-gb",
        **multimodal_kwargs["mm_processor_cache_gb"])
    multimodal_group.add_argument("--disable-mm-preprocessor-cache",
                                  action="store_true",
                                  deprecated=True)
    multimodal_group.add_argument(
        "--mm-encoder-tp-mode", **multimodal_kwargs["mm_encoder_tp_mode"])
    multimodal_group.add_argument(
        "--interleave-mm-strings",
        **multimodal_kwargs["interleave_mm_strings"])
    multimodal_group.add_argument("--skip-mm-profiling",
                                  **multimodal_kwargs["skip_mm_profiling"])

    # LoRA related configs
    lora_kwargs = get_kwargs(LoRAConfig)
    lora_group = parser.add_argument_group(
        title="LoRAConfig",
        description=LoRAConfig.__doc__,
    )
    lora_group.add_argument(
        "--enable-lora",
        action=argparse.BooleanOptionalAction,
        help="If True, enable handling of LoRA adapters.")
    lora_group.add_argument("--enable-lora-bias",
                            **lora_kwargs["bias_enabled"])
    lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
    lora_group.add_argument("--max-lora-rank",
                            **lora_kwargs["max_lora_rank"])
    lora_group.add_argument("--lora-extra-vocab-size",
                            **lora_kwargs["lora_extra_vocab_size"])
    lora_group.add_argument(
        "--lora-dtype",
        **lora_kwargs["lora_dtype"],
    )
    lora_group.add_argument("--max-cpu-loras",
                            **lora_kwargs["max_cpu_loras"])
    lora_group.add_argument("--fully-sharded-loras",
                            **lora_kwargs["fully_sharded_loras"])
    lora_group.add_argument("--default-mm-loras",
                            **lora_kwargs["default_mm_loras"])

    # Observability arguments
    observability_kwargs = get_kwargs(ObservabilityConfig)
    observability_group = parser.add_argument_group(
        title="ObservabilityConfig",
        description=ObservabilityConfig.__doc__,
    )
    observability_group.add_argument(
        "--show-hidden-metrics-for-version",
        **observability_kwargs["show_hidden_metrics_for_version"])
    observability_group.add_argument(
        "--otlp-traces-endpoint",
        **observability_kwargs["otlp_traces_endpoint"])
    # TODO: generalise this special case
    choices = observability_kwargs["collect_detailed_traces"]["choices"]
    metavar = f"{{{','.join(choices)}}}"
    observability_kwargs["collect_detailed_traces"]["metavar"] = metavar
    observability_kwargs["collect_detailed_traces"]["choices"] += [
        ",".join(p)
        for p in permutations(get_args(DetailedTraceModules), r=2)
    ]
    observability_group.add_argument(
        "--collect-detailed-traces",
        **observability_kwargs["collect_detailed_traces"])

    # Scheduler arguments
    scheduler_kwargs = get_kwargs(SchedulerConfig)
    scheduler_group = parser.add_argument_group(
        title="SchedulerConfig",
        description=SchedulerConfig.__doc__,
    )
    scheduler_group.add_argument(
        "--max-num-batched-tokens",
        **scheduler_kwargs["max_num_batched_tokens"])
    scheduler_group.add_argument("--max-num-seqs",
                                 **scheduler_kwargs["max_num_seqs"])
    scheduler_group.add_argument(
        "--max-num-partial-prefills",
        **scheduler_kwargs["max_num_partial_prefills"])
    scheduler_group.add_argument(
        "--max-long-partial-prefills",
        **scheduler_kwargs["max_long_partial_prefills"])
    scheduler_group.add_argument('--cuda-graph-sizes',
                                 **scheduler_kwargs["cuda_graph_sizes"])
    scheduler_group.add_argument(
        "--long-prefill-token-threshold",
        **scheduler_kwargs["long_prefill_token_threshold"])
    scheduler_group.add_argument("--num-lookahead-slots",
                                 **scheduler_kwargs["num_lookahead_slots"])
    scheduler_group.add_argument("--scheduler-delay-factor",
                                 **scheduler_kwargs["delay_factor"])
    scheduler_group.add_argument("--preemption-mode",
                                 **scheduler_kwargs["preemption_mode"])
    # multi-step scheduling has been removed; corresponding arguments
    # are no longer supported.
    scheduler_group.add_argument("--scheduling-policy",
                                 **scheduler_kwargs["policy"])
    scheduler_group.add_argument(
        "--enable-chunked-prefill",
        **scheduler_kwargs["enable_chunked_prefill"])
    scheduler_group.add_argument(
        "--disable-chunked-mm-input",
        **scheduler_kwargs["disable_chunked_mm_input"])
    scheduler_group.add_argument("--scheduler-cls",
                                 **scheduler_kwargs["scheduler_cls"])
    scheduler_group.add_argument(
        "--disable-hybrid-kv-cache-manager",
        **scheduler_kwargs["disable_hybrid_kv_cache_manager"])
    scheduler_group.add_argument("--async-scheduling",
                                 **scheduler_kwargs["async_scheduling"])

    # vLLM arguments
    vllm_kwargs = get_kwargs(VllmConfig)
    vllm_group = parser.add_argument_group(
        title="VllmConfig",
        description=VllmConfig.__doc__,
    )
    # We construct SpeculativeConfig using fields from other configs in
    # create_engine_config. So we set the type to a JSON string here to
    # delay the Pydantic validation that comes with SpeculativeConfig.
    vllm_kwargs["speculative_config"]["type"] = optional_type(json.loads)
    vllm_group.add_argument("--speculative-config",
                            **vllm_kwargs["speculative_config"])
    vllm_group.add_argument("--kv-transfer-config",
                            **vllm_kwargs["kv_transfer_config"])
    vllm_group.add_argument('--kv-events-config',
                            **vllm_kwargs["kv_events_config"])
    vllm_group.add_argument("--compilation-config", "-O",
                            **vllm_kwargs["compilation_config"])
    vllm_group.add_argument("--additional-config",
                            **vllm_kwargs["additional_config"])

    # Other arguments
    parser.add_argument('--disable-log-stats',
                        action='store_true',
                        help='Disable logging statistics.')

    return parser
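
A hedged usage sketch (the parser type, model name, and flag values below are assumptions for illustration, not prescribed defaults):

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser

# Expose every engine argument as a CLI flag on a fresh parser.
parser = FlexibleArgumentParser(description="vLLM engine CLI sketch")
parser = AsyncEngineArgs.add_cli_args(parser)

# Parse an example command line and turn it back into dataclass arguments.
args = parser.parse_args(["--model", "facebook/opt-125m", "--max-num-seqs", "8"])
engine_args = AsyncEngineArgs.from_cli_args(args)
print(engine_args.model, engine_args.max_num_seqs)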

create_engine_config

create_engine_config(
    usage_context: Optional[UsageContext] = None,
    headless: bool = False,
) -> VllmConfig

Create the VllmConfig.

NOTE: for auto-selection of the V0 vs V1 engine, we need to create the ModelConfig first, since ModelConfig's attributes (e.g. the model architecture) are needed to make the decision.

This function sets VLLM_USE_V1=X if VLLM_USE_V1 is unspecified by the user.

If VLLM_USE_V1 is specified by the user but the VllmConfig is incompatible, an error is raised.
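
A minimal programmatic sketch, assuming vLLM is installed and the (illustrative) model below is reachable:

from vllm import EngineArgs

engine_args = EngineArgs(model="facebook/opt-125m", max_model_len=2048)
vllm_config = engine_args.create_engine_config()
print(type(vllm_config).__name__)  # VllmConfig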

Source code in vllm/engine/arg_utils.py
def create_engine_config(
    self,
    usage_context: Optional[UsageContext] = None,
    headless: bool = False,
) -> VllmConfig:
    """
    Create the VllmConfig.

    NOTE: for autoselection of V0 vs V1 engine, we need to
    create the ModelConfig first, since ModelConfig's attrs
    (e.g. the model arch) are needed to make the decision.

    This function set VLLM_USE_V1=X if VLLM_USE_V1 is
    unspecified by the user.

    If VLLM_USE_V1 is specified by the user but the VllmConfig
    is incompatible, we raise an error.
    """
    current_platform.pre_register_and_update()

    device_config = DeviceConfig(
        device=cast(Device, current_platform.device_type))
    model_config = self.create_model_config()

    # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
    #   and fall back to V0 for experimental or unsupported features.
    # * If VLLM_USE_V1=1, we enable V1 for supported + experimental
    #   features and raise error for unsupported features.
    # * If VLLM_USE_V1=0, we disable V1.
    use_v1 = False
    try_v1 = envs.VLLM_USE_V1 or not envs.is_set("VLLM_USE_V1")
    if try_v1 and self._is_v1_supported_oracle(model_config):
        use_v1 = True

    # If user explicitly set VLLM_USE_V1, sanity check we respect it.
    if envs.is_set("VLLM_USE_V1"):
        assert use_v1 == envs.VLLM_USE_V1
    # Otherwise, set the VLLM_USE_V1 variable globally.
    else:
        envs.set_vllm_use_v1(use_v1)

    # Set default arguments for V0 or V1 Engine.
    if use_v1:
        self._set_default_args_v1(usage_context, model_config)
        # Disable chunked prefill for POWER (ppc64le)/ARM/s390x CPUs in V1
        if current_platform.is_cpu(
        ) and current_platform.get_cpu_architecture() in (
                CpuArchEnum.POWERPC, CpuArchEnum.S390X, CpuArchEnum.ARM):
            logger.info(
                "Chunked prefill is not supported for ARM and POWER "
                "and S390X CPUs; "
                "disabling it for V1 backend.")
            self.enable_chunked_prefill = False
    else:
        self._set_default_args_v0(model_config)
    assert self.enable_chunked_prefill is not None

    if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
        assert self.enforce_eager, (
            "Cuda graph is not supported with DualChunkFlashAttention. "
            "To run the model in eager mode, set 'enforce_eager=True' "
            "or use '--enforce-eager' in the CLI.")
        assert current_platform.is_cuda(), (
            "DualChunkFlashAttention is only supported on CUDA platform.")
        assert not use_v1, (
            "DualChunkFlashAttention is not supported on V1 engine. "
            "To run the model in V0 engine, try set 'VLLM_USE_V1=0'")

    sliding_window: Optional[int] = None
    if not is_interleaved(model_config.hf_text_config):
        # Only set CacheConfig.sliding_window if the model is all sliding
        # window. Otherwise CacheConfig.sliding_window will override the
        # global layers in interleaved sliding window models.
        sliding_window = model_config.get_sliding_window()

    cache_config = CacheConfig(
        block_size=self.block_size,
        gpu_memory_utilization=self.gpu_memory_utilization,
        swap_space=self.swap_space,
        cache_dtype=self.kv_cache_dtype,
        is_attention_free=model_config.is_attention_free,
        num_gpu_blocks_override=self.num_gpu_blocks_override,
        sliding_window=sliding_window,
        enable_prefix_caching=self.enable_prefix_caching,
        prefix_caching_hash_algo=self.prefix_caching_hash_algo,
        cpu_offload_gb=self.cpu_offload_gb,
        calculate_kv_scales=self.calculate_kv_scales,
        kv_sharing_fast_prefill=self.kv_sharing_fast_prefill,
        mamba_cache_dtype=self.mamba_cache_dtype,
        mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
    )

    ray_runtime_env = None
    if is_ray_initialized():
        # Ray Serve LLM calls `create_engine_config` in the context
        # of a Ray task, therefore we check is_ray_initialized()
        # as opposed to is_in_ray_actor().
        import ray
        ray_runtime_env = ray.get_runtime_context().runtime_env
        logger.info("Using ray runtime env: %s", ray_runtime_env)

    # Get the current placement group if Ray is initialized and
    # we are in a Ray actor. If so, then the placement group will be
    # passed to spawned processes.
    placement_group = None
    if is_in_ray_actor():
        import ray

        # This call initializes Ray automatically if it is not initialized,
        # but we should not do this here.
        placement_group = ray.util.get_current_placement_group()

    assert not headless or not self.data_parallel_hybrid_lb, (
        "data_parallel_hybrid_lb is not applicable in "
        "headless mode")

    data_parallel_external_lb = self.data_parallel_rank is not None
    # Local DP rank = 1, use pure-external LB.
    if data_parallel_external_lb:
        assert self.data_parallel_size_local in (1, None), (
            "data_parallel_size_local must be 1 when data_parallel_rank "
            "is set")
        data_parallel_size_local = 1
        # Use full external lb if we have local_size of 1.
        self.data_parallel_hybrid_lb = False
    elif self.data_parallel_size_local is not None:
        data_parallel_size_local = self.data_parallel_size_local

        if self.data_parallel_start_rank and not headless:
            # Infer hybrid LB mode.
            self.data_parallel_hybrid_lb = True

        if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
            # Use full external lb if we have local_size of 1.
            data_parallel_external_lb = True
            self.data_parallel_hybrid_lb = False

        if data_parallel_size_local == self.data_parallel_size:
            # Disable hybrid LB mode if set for a single node
            self.data_parallel_hybrid_lb = False

        self.data_parallel_rank = self.data_parallel_start_rank or 0
    else:
        assert not self.data_parallel_hybrid_lb, (
            "data_parallel_size_local must be set to use "
            "data_parallel_hybrid_lb.")

        # Local DP size defaults to global DP size if not set.
        data_parallel_size_local = self.data_parallel_size

    # DP address, used in multi-node case for torch distributed group
    # and ZMQ sockets.
    if self.data_parallel_address is None:
        if self.data_parallel_backend == "ray":
            host_ip = get_ip()
            logger.info(
                "Using host IP %s as ray-based data parallel address",
                host_ip)
            data_parallel_address = host_ip
        else:
            assert self.data_parallel_backend == "mp", (
                "data_parallel_backend can only be ray or mp, got %s",
                self.data_parallel_backend)
            data_parallel_address = ParallelConfig.data_parallel_master_ip
    else:
        data_parallel_address = self.data_parallel_address

    # This port is only used when there are remote data parallel engines,
    # otherwise the local IPC transport is used.
    data_parallel_rpc_port = self.data_parallel_rpc_port if (
        self.data_parallel_rpc_port
        is not None) else ParallelConfig.data_parallel_rpc_port

    if self.async_scheduling:
        # Async scheduling does not work with the uniprocess backend.
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "mp"
            logger.info("Using mp-based distributed executor backend "
                        "for async scheduling.")
        if self.distributed_executor_backend == "uni":
            raise ValueError("Async scheduling is not supported with "
                             "uni-process backend.")
        if self.pipeline_parallel_size > 1:
            raise ValueError("Async scheduling is not supported with "
                             "pipeline-parallel-size > 1.")

        # Currently, async scheduling does not support speculative decoding.
        # TODO(woosuk): Support it.
        if self.speculative_config is not None:
            raise ValueError(
                "Currently, speculative decoding is not supported with "
                "async scheduling.")

    # Forward the deprecated CLI args to the EPLB config.
    if self.num_redundant_experts is not None:
        self.eplb_config.num_redundant_experts = self.num_redundant_experts
    if self.eplb_window_size is not None:
        self.eplb_config.window_size = self.eplb_window_size
    if self.eplb_step_interval is not None:
        self.eplb_config.step_interval = self.eplb_step_interval
    if self.eplb_log_balancedness is not None:
        self.eplb_config.log_balancedness = self.eplb_log_balancedness

    parallel_config = ParallelConfig(
        pipeline_parallel_size=self.pipeline_parallel_size,
        tensor_parallel_size=self.tensor_parallel_size,
        data_parallel_size=self.data_parallel_size,
        data_parallel_rank=self.data_parallel_rank or 0,
        data_parallel_external_lb=data_parallel_external_lb,
        data_parallel_size_local=data_parallel_size_local,
        data_parallel_master_ip=data_parallel_address,
        data_parallel_rpc_port=data_parallel_rpc_port,
        data_parallel_backend=self.data_parallel_backend,
        data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
        enable_expert_parallel=self.enable_expert_parallel,
        enable_eplb=self.enable_eplb,
        eplb_config=self.eplb_config,
        max_parallel_loading_workers=self.max_parallel_loading_workers,
        disable_custom_all_reduce=self.disable_custom_all_reduce,
        ray_workers_use_nsight=self.ray_workers_use_nsight,
        ray_runtime_env=ray_runtime_env,
        placement_group=placement_group,
        distributed_executor_backend=self.distributed_executor_backend,
        worker_cls=self.worker_cls,
        worker_extension_cls=self.worker_extension_cls,
    )

    if model_config.is_multimodal_model:
        dp_supports_mm_processor_cache = (self.data_parallel_size == 1
                                          or data_parallel_external_lb)
        if (not dp_supports_mm_processor_cache
                and model_config.mm_processor_cache_gb > 0):
            logger.warning(
                "Multi-modal processor cache is disabled because "
                "it is not compatible with data parallelism when "
                "there does not exist a one-to-one correspondance "
                "between API and engine core processes.")
            model_config.set_mm_processor_cache_gb(0)

    speculative_config = self.create_speculative_config(
        target_model_config=model_config,
        target_parallel_config=parallel_config,
        enable_chunked_prefill=self.enable_chunked_prefill,
        disable_log_stats=self.disable_log_stats,
    )

    # make sure num_lookahead_slots is set appropriately depending on
    # whether speculative decoding is enabled
    num_lookahead_slots = self.num_lookahead_slots
    if speculative_config is not None:
        num_lookahead_slots = speculative_config.num_lookahead_slots

    scheduler_config = SchedulerConfig(
        runner_type=model_config.runner_type,
        max_num_batched_tokens=self.max_num_batched_tokens,
        max_num_seqs=self.max_num_seqs,
        max_model_len=model_config.max_model_len,
        cuda_graph_sizes=self.cuda_graph_sizes,
        num_lookahead_slots=num_lookahead_slots,
        delay_factor=self.scheduler_delay_factor,
        enable_chunked_prefill=self.enable_chunked_prefill,
        disable_chunked_mm_input=self.disable_chunked_mm_input,
        is_multimodal_model=model_config.is_multimodal_model,
        preemption_mode=self.preemption_mode,
        send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                         and parallel_config.use_ray),
        policy=self.scheduling_policy,
        scheduler_cls=self.scheduler_cls,
        max_num_partial_prefills=self.max_num_partial_prefills,
        max_long_partial_prefills=self.max_long_partial_prefills,
        long_prefill_token_threshold=self.long_prefill_token_threshold,
        disable_hybrid_kv_cache_manager=self.
        disable_hybrid_kv_cache_manager,
        async_scheduling=self.async_scheduling,
    )

    if not model_config.is_multimodal_model and self.default_mm_loras:
        raise ValueError(
            "Default modality-specific LoRA(s) were provided for a "
            "non multimodal model")

    lora_config = LoRAConfig(
        bias_enabled=self.enable_lora_bias,
        max_lora_rank=self.max_lora_rank,
        max_loras=self.max_loras,
        default_mm_loras=self.default_mm_loras,
        fully_sharded_loras=self.fully_sharded_loras,
        lora_extra_vocab_size=self.lora_extra_vocab_size,
        lora_dtype=self.lora_dtype,
        max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
        and self.max_cpu_loras > 0 else None) if self.enable_lora else None

    # bitsandbytes pre-quantized models need a specific model loader
    if model_config.quantization == "bitsandbytes":
        self.quantization = self.load_format = "bitsandbytes"

    load_config = self.create_load_config()

    decoding_config = DecodingConfig(
        backend=self.guided_decoding_backend,
        disable_fallback=self.guided_decoding_disable_fallback,
        disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
        disable_additional_properties=\
            self.guided_decoding_disable_additional_properties,
        reasoning_backend=self.reasoning_parser
    )

    observability_config = ObservabilityConfig(
        show_hidden_metrics_for_version=(
            self.show_hidden_metrics_for_version),
        otlp_traces_endpoint=self.otlp_traces_endpoint,
        collect_detailed_traces=self.collect_detailed_traces,
    )

    config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        device_config=device_config,
        lora_config=lora_config,
        speculative_config=speculative_config,
        load_config=load_config,
        decoding_config=decoding_config,
        observability_config=observability_config,
        compilation_config=self.compilation_config,
        kv_transfer_config=self.kv_transfer_config,
        kv_events_config=self.kv_events_config,
        additional_config=self.additional_config,
    )

    return config

create_load_config

create_load_config() -> LoadConfig
Source code in vllm/engine/arg_utils.py
def create_load_config(self) -> LoadConfig:

    if self.quantization == "bitsandbytes":
        self.load_format = "bitsandbytes"

    if self.load_format == "tensorizer":
        if hasattr(self.model_loader_extra_config, "to_serializable"):
            self.model_loader_extra_config = (
                self.model_loader_extra_config.to_serializable())
        self.model_loader_extra_config["tensorizer_config"] = {}
        self.model_loader_extra_config["tensorizer_config"][
            "tensorizer_dir"] = self.model
        self.validate_tensorizer_args()

    return LoadConfig(
        load_format=self.load_format,
        download_dir=self.download_dir,
        device="cpu"
        if is_online_quantization(self.quantization) else None,
        model_loader_extra_config=self.model_loader_extra_config,
        ignore_patterns=self.ignore_patterns,
        use_tqdm_on_load=self.use_tqdm_on_load,
        pt_load_map_location=self.pt_load_map_location,
    )

create_model_config

create_model_config() -> ModelConfig
Source code in vllm/engine/arg_utils.py
def create_model_config(self) -> ModelConfig:
    # gguf file needs a specific model loader and doesn't use hf_repo
    if check_gguf_file(self.model):
        self.quantization = self.load_format = "gguf"

    # NOTE: This is to allow model loading from S3 in CI
    if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
            and self.model in MODELS_ON_S3 and self.load_format == "auto"):
        self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
        self.load_format = "runai_streamer"

    if self.disable_mm_preprocessor_cache:
        logger.warning(
            "`--disable-mm-preprocessor-cache` is deprecated "
            "and will be removed in v0.13. "
            "Please use `--mm-processor-cache-gb 0` instead.", )

        self.mm_processor_cache_gb = 0
    elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
        logger.warning(
            "VLLM_MM_INPUT_CACHE_GIB` is deprecated "
            "and will be removed in v0.13. "
            "Please use `--mm-processor-cache-gb %d` instead.",
            envs.VLLM_MM_INPUT_CACHE_GIB,
        )

        self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB

    if self.enable_multimodal_encoder_data_parallel:
        logger.warning(
            "--enable-multimodal-encoder-data-parallel` is deprecated "
            "and will be removed in v0.13. "
            "Please use `--mm-encoder-tp-mode data` instead.")

        self.mm_encoder_tp_mode = "data"

    return ModelConfig(
        model=self.model,
        hf_config_path=self.hf_config_path,
        runner=self.runner,
        convert=self.convert,
        task=self.task,
        tokenizer=self.tokenizer,
        tokenizer_mode=self.tokenizer_mode,
        trust_remote_code=self.trust_remote_code,
        allowed_local_media_path=self.allowed_local_media_path,
        dtype=self.dtype,
        seed=self.seed,
        revision=self.revision,
        code_revision=self.code_revision,
        rope_scaling=self.rope_scaling,
        rope_theta=self.rope_theta,
        hf_token=self.hf_token,
        hf_overrides=self.hf_overrides,
        tokenizer_revision=self.tokenizer_revision,
        max_model_len=self.max_model_len,
        quantization=self.quantization,
        enforce_eager=self.enforce_eager,
        max_seq_len_to_capture=self.max_seq_len_to_capture,
        max_logprobs=self.max_logprobs,
        logprobs_mode=self.logprobs_mode,
        disable_sliding_window=self.disable_sliding_window,
        disable_cascade_attn=self.disable_cascade_attn,
        skip_tokenizer_init=self.skip_tokenizer_init,
        enable_prompt_embeds=self.enable_prompt_embeds,
        served_model_name=self.served_model_name,
        limit_mm_per_prompt=self.limit_mm_per_prompt,
        interleave_mm_strings=self.interleave_mm_strings,
        media_io_kwargs=self.media_io_kwargs,
        skip_mm_profiling=self.skip_mm_profiling,
        use_async_output_proc=not self.disable_async_output_proc,
        config_format=self.config_format,
        mm_processor_kwargs=self.mm_processor_kwargs,
        mm_processor_cache_gb=self.mm_processor_cache_gb,
        mm_encoder_tp_mode=self.mm_encoder_tp_mode,
        override_neuron_config=self.override_neuron_config,
        override_pooler_config=self.override_pooler_config,
        logits_processor_pattern=self.logits_processor_pattern,
        generation_config=self.generation_config,
        override_generation_config=self.override_generation_config,
        enable_sleep_mode=self.enable_sleep_mode,
        model_impl=self.model_impl,
        override_attention_dtype=self.override_attention_dtype,
        logits_processors=self.logits_processors,
    )

create_speculative_config

create_speculative_config(
    target_model_config: ModelConfig,
    target_parallel_config: ParallelConfig,
    enable_chunked_prefill: bool,
    disable_log_stats: bool,
) -> Optional[SpeculativeConfig]

Initializes and returns a SpeculativeConfig object based on speculative_config.

This function utilizes speculative_config to create a SpeculativeConfig object. The speculative_config can either be provided as a JSON string input via CLI arguments or directly as a dictionary from the engine.
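
As a non-authoritative sketch, the same configuration can be passed as a dictionary when constructing EngineArgs; the draft-model path and token count below are placeholders:

from vllm import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",
    # Placeholder values; accepted keys mirror SpeculativeConfig fields.
    speculative_config={
        "model": "path/to/draft-model",
        "num_speculative_tokens": 5,
    },
)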

Source code in vllm/engine/arg_utils.py
def create_speculative_config(
    self,
    target_model_config: ModelConfig,
    target_parallel_config: ParallelConfig,
    enable_chunked_prefill: bool,
    disable_log_stats: bool,
) -> Optional["SpeculativeConfig"]:
    """Initializes and returns a SpeculativeConfig object based on
    `speculative_config`.

    This function utilizes `speculative_config` to create a
    SpeculativeConfig object. The `speculative_config` can either be
    provided as a JSON string input via CLI arguments or directly as a
    dictionary from the engine.
    """

    from vllm.transformers_utils.config import get_config
    from vllm.transformers_utils.configs.speculators.base import (
        SpeculatorsConfig)

    if self.speculative_config is None:
        hf_config = get_config(self.hf_config_path or self.model,
                               self.trust_remote_code, self.revision,
                               self.code_revision, self.config_format)

        # If loading a SpeculatorsConfig, load the speculative_config
        # details from the config directly;
        # no user input is required / expected.
        if isinstance(hf_config, SpeculatorsConfig):
            # Create one here since the user did not provide one.
            self.speculative_config = {}
            self.speculative_config[
                "num_speculative_tokens"] = hf_config.num_lookahead_tokens
            self.speculative_config["model"] = self.model
            self.speculative_config["method"] = hf_config.method
        else:
            return None

    # Note(Shangming): These parameters are not obtained from the cli arg
    # '--speculative-config' and must be passed in when creating the engine
    # config.
    self.speculative_config.update({
        "target_model_config": target_model_config,
        "target_parallel_config": target_parallel_config,
        "enable_chunked_prefill": enable_chunked_prefill,
        "disable_log_stats": disable_log_stats,
    })
    return SpeculativeConfig(**self.speculative_config)

from_cli_args classmethod

from_cli_args(args: Namespace)
Source code in vllm/engine/arg_utils.py
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
    # Get the list of attributes of this dataclass.
    attrs = [attr.name for attr in dataclasses.fields(cls)]
    # Set the attributes from the parsed arguments.
    engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
    return engine_args

validate_tensorizer_args

validate_tensorizer_args()
Source code in vllm/engine/arg_utils.py
def validate_tensorizer_args(self):
    from vllm.model_executor.model_loader.tensorizer import (
        TensorizerConfig)
    for key in self.model_loader_extra_config:
        if key in TensorizerConfig._fields:
            self.model_loader_extra_config["tensorizer_config"][
                key] = self.model_loader_extra_config[key]

LLM

An LLM for generating texts from given prompts and sampling parameters.

This class includes a tokenizer, a language model (possibly distributed across multiple GPUs), and GPU memory space allocated for intermediate states (aka KV cache). Given a batch of prompts and sampling parameters, this class generates texts from the model, using an intelligent batching mechanism and efficient memory management.

Parameters:

Name Type Description Default
model str

The name or path of a HuggingFace Transformers model.

required
tokenizer Optional[str]

The name or path of a HuggingFace Transformers tokenizer.

None
tokenizer_mode TokenizerMode

The tokenizer mode. "auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer.

'auto'
skip_tokenizer_init bool

If true, skip initialization of tokenizer and detokenizer. Expect valid prompt_token_ids and None for prompt from the input.

False
trust_remote_code bool

Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.

False
allowed_local_media_path str

Allowing API requests to read local images or videos from directories specified by the server file system. This is a security risk. Should only be enabled in trusted environments.

''
tensor_parallel_size int

The number of GPUs to use for distributed execution with tensor parallelism.

1
dtype ModelDType

The data type for the model weights and activations. Currently, we support float32, float16, and bfloat16. If auto, we use the torch_dtype attribute specified in the model config file. However, if the torch_dtype in the config is float32, we will use float16 instead.

'auto'
quantization Optional[QuantizationMethods]

The method used to quantize the model weights. Currently, we support "awq", "gptq", and "fp8" (experimental). If None, we first check the quantization_config attribute in the model config file. If that is None, we assume the model weights are not quantized and use dtype to determine the data type of the weights.

None
revision Optional[str]

The specific model version to use. It can be a branch name, a tag name, or a commit id.

None
tokenizer_revision Optional[str]

The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id.

None
seed Optional[int]

The seed to initialize the random number generator for sampling.

None
gpu_memory_utilization float

The ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache. Higher values will increase the KV cache size and thus improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors.

0.9
swap_space float

The size (GiB) of CPU memory per GPU to use as swap space. This can be used for temporarily storing the states of the requests when their best_of sampling parameters are larger than 1. If all requests will have best_of=1, you can safely set this to 0. Note that best_of is only supported in V0. Otherwise, too small values may cause out-of-memory (OOM) errors.

4
cpu_offload_gb float

The size (GiB) of CPU memory to use for offloading the model weights. This virtually increases the GPU memory space you can use to hold the model weights, at the cost of CPU-GPU data transfer for every forward pass.

0
enforce_eager bool

Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid.

False
max_seq_len_to_capture int

Maximum sequence len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode.

8192
disable_custom_all_reduce bool

See ParallelConfig (vllm.config.ParallelConfig).

False
disable_async_output_proc bool

Disable async output processing. This may result in lower performance.

False
hf_token Optional[Union[bool, str]]

The token to use as HTTP bearer authorization for remote files. If True, will use the token generated when running huggingface-cli login (stored in ~/.huggingface).

None
hf_overrides Optional[HfOverrides]

If a dictionary, contains arguments to be forwarded to the HuggingFace config. If a callable, it is called to update the HuggingFace config.

None
mm_processor_kwargs Optional[dict[str, Any]]

Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. Overrides for the multi-modal processor obtained from AutoProcessor.from_pretrained. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: {"num_crops": 4}.

None
override_pooler_config Optional[PoolerConfig]

Initialize non-default pooling config or override default pooling config for the pooling model. e.g. PoolerConfig(pooling_type="mean", normalize=False).

None
compilation_config Optional[Union[int, dict[str, Any], CompilationConfig]]

Either an integer or a dictionary. If it is an integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration.

None
**kwargs

Arguments for EngineArgs.

{}
Note

This class is intended to be used for offline inference. For online serving, use the AsyncLLMEngine class instead.
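
A minimal offline-inference sketch, assuming the (illustrative) model below can be downloaded; sampling values are arbitrary:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=32)

outputs = llm.generate(["Hello, my name is"], sampling_params)
for output in outputs:
    # Each RequestOutput carries one or more CompletionOutput objects.
    print(output.outputs[0].text)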

Source code in vllm/entrypoints/llm.py
class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    Args:
        model: The name or path of a HuggingFace Transformers model.
        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer. Expect valid prompt_token_ids and None for prompt
            from the input.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        allowed_local_media_path: Allowing API requests to read local images
            or videos from directories specified by the server file system.
            This is a security risk. Should only be enabled in trusted
            environments.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq", "gptq", and "fp8" (experimental).
            If None, we first check the `quantization_config` attribute in the
            model config file. If that is None, we assume the model weights are
            not quantized and use `dtype` to determine the data type of
            the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Noting that `best_of` is only supported in V0. Otherwise, too small
            values may cause out-of-memory (OOM) errors.
        cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
            the model weights. This virtually increases the GPU memory space
            you can use to hold the model weights, at the cost of CPU-GPU data
            transfer for every forward pass.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode. Additionally for encoder-decoder models, if the
            sequence length of the encoder input is larger than this, we fall
            back to the eager mode.
        disable_custom_all_reduce: See
            [ParallelConfig][vllm.config.ParallelConfig].
        disable_async_output_proc: Disable async output processing.
            This may result in lower performance.
        hf_token: The token to use as HTTP bearer authorization for remote files
            . If `True`, will use the token generated when running
            `huggingface-cli login` (stored in `~/.huggingface`).
        hf_overrides: If a dictionary, contains arguments to be forwarded to the
            HuggingFace config. If a callable, it is called to update the
            HuggingFace config.
        mm_processor_kwargs: Arguments to be forwarded to the model's processor
            for multi-modal data, e.g., image processor. Overrides for the
            multi-modal processor obtained from `AutoProcessor.from_pretrained`.
            The available overrides depend on the model that is being run.
            For example, for Phi-3-Vision: `{"num_crops": 4}`.
        override_pooler_config: Initialize non-default pooling config or
            override default pooling config for the pooling model.
            e.g. `PoolerConfig(pooling_type="mean", normalize=False)`.
        compilation_config: Either an integer or a dictionary. If it is an
            integer, it is used as the level of compilation optimization. If it
            is a dictionary, it can specify the full compilation configuration.
        **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].

    Note:
        This class is intended to be used for offline inference. For online
        serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead.
    """

    def __init__(
        self,
        model: str,
        *,
        runner: RunnerOption = "auto",
        convert: ConvertOption = "auto",
        tokenizer: Optional[str] = None,
        tokenizer_mode: TokenizerMode = "auto",
        skip_tokenizer_init: bool = False,
        trust_remote_code: bool = False,
        allowed_local_media_path: str = "",
        tensor_parallel_size: int = 1,
        dtype: ModelDType = "auto",
        quantization: Optional[QuantizationMethods] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: Optional[int] = None,
        gpu_memory_utilization: float = 0.9,
        swap_space: float = 4,
        cpu_offload_gb: float = 0,
        enforce_eager: bool = False,
        max_seq_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        disable_async_output_proc: bool = False,
        hf_token: Optional[Union[bool, str]] = None,
        hf_overrides: Optional[HfOverrides] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
        override_pooler_config: Optional[PoolerConfig] = None,
        compilation_config: Optional[Union[int, dict[str, Any],
                                           CompilationConfig]] = None,
        logits_processors: Optional[list[Union[str,
                                               type[LogitsProcessor]]]] = None,
        **kwargs,
    ) -> None:
        """LLM constructor."""

        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True

        if "worker_cls" in kwargs:
            worker_cls = kwargs["worker_cls"]
            # If worker_cls is not a qualified string name,
            # we serialize it using cloudpickle to avoid pickling issues.
            if isinstance(worker_cls, type):
                kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)

        if "kv_transfer_config" in kwargs and isinstance(
                kwargs["kv_transfer_config"], dict):
            from vllm.config import KVTransferConfig
            raw_config_dict = kwargs["kv_transfer_config"]
            try:
                kwargs["kv_transfer_config"] = KVTransferConfig(
                    **raw_config_dict)
            except ValidationError as e:
                logger.error(
                    "Failed to convert 'kv_transfer_config' dict to "
                    "KVTransferConfig object. Dict: %s. Error: %s",
                    raw_config_dict, e)
                # Consider re-raising a more specific vLLM error or ValueError
                # to provide better context to the user.
                raise ValueError(
                    f"Invalid 'kv_transfer_config' provided: {e}") from e

        if hf_overrides is None:
            hf_overrides = {}

        if compilation_config is not None:
            if isinstance(compilation_config, int):
                compilation_config_instance = CompilationConfig(
                    level=compilation_config)
            elif isinstance(compilation_config, dict):
                predicate = lambda x: is_init_field(CompilationConfig, x[0])
                compilation_config_instance = CompilationConfig(
                    **dict(filter(predicate, compilation_config.items())))
            else:
                compilation_config_instance = compilation_config
        else:
            compilation_config_instance = CompilationConfig()

        engine_args = EngineArgs(
            model=model,
            runner=runner,
            convert=convert,
            tokenizer=tokenizer,
            tokenizer_mode=tokenizer_mode,
            skip_tokenizer_init=skip_tokenizer_init,
            trust_remote_code=trust_remote_code,
            allowed_local_media_path=allowed_local_media_path,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            cpu_offload_gb=cpu_offload_gb,
            enforce_eager=enforce_eager,
            max_seq_len_to_capture=max_seq_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            disable_async_output_proc=disable_async_output_proc,
            hf_token=hf_token,
            hf_overrides=hf_overrides,
            mm_processor_kwargs=mm_processor_kwargs,
            override_pooler_config=override_pooler_config,
            compilation_config=compilation_config_instance,
            logits_processors=logits_processors,
            **kwargs,
        )

        log_non_default_args(engine_args)

        # Create the Engine (autoselects V0 vs V1)
        self.llm_engine = LLMEngine.from_engine_args(
            engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
        self.engine_class = type(self.llm_engine)

        self.request_counter = Counter()
        self.default_sampling_params: Union[dict[str, Any], None] = None

        if envs.VLLM_USE_V1:
            supported_tasks = self.llm_engine \
                .get_supported_tasks()  # type: ignore
        else:
            supported_tasks = self.llm_engine.model_config.supported_tasks

        logger.info("Supported_tasks: %s", supported_tasks)

        self.supported_tasks = supported_tasks

    def get_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        return self.llm_engine.get_tokenizer_group().get_lora_tokenizer(
            lora_request)

    def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
        tokenizer_group = self.llm_engine.get_tokenizer_group()

        # While CachedTokenizer is dynamic, we have no choice but to
        # compare the class name. Misjudgment can arise from a
        # user-defined tokenizer whose name starts with 'Cached'.
        if tokenizer.__class__.__name__.startswith("Cached"):
            tokenizer_group.tokenizer = tokenizer
        else:
            tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)

    def get_default_sampling_params(self) -> SamplingParams:
        if self.default_sampling_params is None:
            self.default_sampling_params = (
                self.llm_engine.model_config.get_diff_sampling_param())
        if self.default_sampling_params:
            return SamplingParams.from_optional(**self.default_sampling_params)
        return SamplingParams()

    def generate(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        sampling_params: Optional[Union[SamplingParams,
                                        Sequence[SamplingParams]]] = None,
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        priority: Optional[list[int]] = None,
    ) -> list[RequestOutput]:
        """Generates the completions for the input prompts.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompts.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
                When it is a single value, it is applied to every prompt.
                When it is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            priority: The priority of the requests, if any.
                Only applicable when priority scheduling policy is enabled.

        Returns:
            A list of `RequestOutput` objects containing the
            generated completions in the same order as the input prompts.

        Note:
            Using `prompts` and `prompt_token_ids` as keyword parameters is
            considered legacy and may be deprecated in the future. You should
            instead pass them via the `inputs` parameter.
        """
        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "generate":
            raise ValueError(
                "LLM.generate() is only supported for generative models. "
                "Try passing `--runner generate` to use the model as a "
                "generative model.")

        if sampling_params is None:
            # Use default sampling params.
            sampling_params = self.get_default_sampling_params()

        tokenization_kwargs: dict[str, Any] = {}
        truncate_prompt_tokens = None
        if isinstance(sampling_params, SamplingParams):
            truncate_prompt_tokens = sampling_params.truncate_prompt_tokens

        _validate_truncation_size(model_config.max_model_len,
                                  truncate_prompt_tokens, tokenization_kwargs)

        # Add any modality specific loras to the corresponding prompts
        lora_request = self._get_modality_specific_lora_reqs(
            prompts, lora_request)

        self._validate_and_add_requests(
            prompts=prompts,
            params=sampling_params,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            tokenization_kwargs=tokenization_kwargs,
            priority=priority,
        )

        outputs = self._run_engine(use_tqdm=use_tqdm)
        return self.engine_class.validate_outputs(outputs, RequestOutput)

    def _get_modality_specific_lora_reqs(
            self, prompts: Union[PromptType, Sequence[PromptType]],
            lora_request: Optional[Union[list[LoRARequest], LoRARequest]]):
        # Grab the lora config off the vllm config on the engine,
        # since this is the same for both v0 & v1.
        lora_config = self.llm_engine.vllm_config.lora_config

        # If there's no lora config / default_mm_loras, or the model
        # isn't multimodal, leave the lora as is.
        if (lora_config is None
                or not self.llm_engine.model_config.is_multimodal_model
                or (lora_config and lora_config.default_mm_loras is None)):
            return lora_request

        if not isinstance(prompts, Sequence):
            prompts = [prompts]

        optional_loras = ([lora_request] * len(prompts)
                          if not isinstance(lora_request, Sequence) else
                          lora_request)

        return [
            self._resolve_single_prompt_mm_lora(
                prompt,
                opt_lora_req,
                lora_config.default_mm_loras,
            ) for prompt, opt_lora_req in zip(prompts, optional_loras)
        ]

    def _resolve_single_prompt_mm_lora(self, prompt: PromptType,
                                       lora_request: Optional[LoRARequest],
                                       default_mm_loras: Optional[dict[str,
                                                                       str]]):
        if (not default_mm_loras or not isinstance(prompt, dict)
                or "multi_modal_data" not in prompt):
            return lora_request

        prompt = cast(Union[TextPrompt, TokensPrompt], prompt)

        intersection = set(prompt["multi_modal_data"].keys()) \
            .intersection(default_mm_loras.keys())
        if not intersection:
            return lora_request
        if len(intersection) > 1:
            # TODO: Would be nice to be able to have multiple loras per prompt
            logger.warning(
                "Multiple modality specific loras were registered and would be"
                " used by a single prompt consuming several modalities; "
                " currently we only support one lora per request; as such,"
                " lora(s) registered with modalities: %s"
                " will be skipped", intersection)
            return lora_request

        # Build the LoRA request; the ID of the default mm lora is the
        # index of the modality name sorted alphabetically + 1.
        modality_name = intersection.pop()
        modality_lora_path = default_mm_loras[modality_name]
        modality_lora_id = sorted(default_mm_loras).index(modality_name) + 1

        # If an explicit lora_request was also provided, warn when its ID
        # differs from the default modality LoRA's ID, but always send the
        # explicitly provided request.
        if lora_request:
            if lora_request.lora_int_id != modality_lora_id:
                logger.warning(
                    "A modality with a registered lora and a lora_request "
                    "with a different ID were provided; falling back to the "
                    "lora_request as we only apply one LoRARequest per prompt")
            return lora_request

        return LoRARequest(
            modality_name,
            modality_lora_id,
            modality_lora_path,
        )

    def collective_rpc(self,
                       method: Union[str, Callable[..., _R]],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
        """
        Execute an RPC call on all workers.

        Args:
            method: Name of the worker method to execute, or a callable that
                is serialized and sent to all workers to execute.

                If the method is a callable, it should accept an additional
                `self` argument, in addition to the arguments passed in `args`
                and `kwargs`. The `self` argument will be the worker object.
            timeout: Maximum time in seconds to wait for execution. Raises a
                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
            args: Positional arguments to pass to the worker method.
            kwargs: Keyword arguments to pass to the worker method.

        Returns:
            A list containing the results from each worker.

        Note:
            It is recommended to use this API to only pass control messages,
            and set up data-plane communication to pass data.
        """

        return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
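
A hedged sketch of `collective_rpc` with a callable; the callable runs on every worker and receives the worker object as its first argument. The attribute lookups below use `getattr` with defaults because the exact worker fields are an assumption:

def describe_worker(worker):
    # runs inside each worker process; report whatever identifying info exists
    return {
        "rank": getattr(worker, "rank", None),
        "device": str(getattr(worker, "device", "unknown")),
    }

infos = llm.collective_rpc(describe_worker)
print(infos)  # one entry per worker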

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """
        Run a function directly on the model inside each worker,
        returning the result for each of them.
        """
        executor = self.llm_engine.model_executor
        return executor.apply_model(func)
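
For example, `apply_model` can be used to inspect the loaded `nn.Module` on each worker; the sketch below counts parameters (a read-only function is assumed):

def count_parameters(model):
    # `model` is the torch.nn.Module held by each worker
    return sum(p.numel() for p in model.parameters())

print(llm.apply_model(count_parameters))  # one count per worker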

    def _get_beam_search_lora_requests(
        self,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
        prompts: list[Union[TokensPrompt, TextPrompt]],
    ) -> list[Optional[LoRARequest]]:
        """Get the optional lora request corresponding to each prompt."""
        if isinstance(lora_request,
                      Sequence) and len(lora_request) != len(prompts):
            raise ValueError(
                "Lora request list should be the same length as the prompts")

        if lora_request is None or isinstance(lora_request, LoRARequest):
            return [lora_request] * len(prompts)

        raise TypeError(f"Invalid lora_request type {type(lora_request)}")

    def beam_search(
        self,
        prompts: list[Union[TokensPrompt, TextPrompt]],
        params: BeamSearchParams,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        use_tqdm: bool = False,
    ) -> list[BeamSearchOutput]:
        """
        Generate sequences using beam search.

        Args:
            prompts: A list of prompts. Each prompt can be a string or a list
                of token IDs.
            params: The beam search parameters.
            lora_request: LoRA request to use for generation, if any.
            use_tqdm: Whether to use tqdm to display the progress bar.
        """
        # TODO: how does beam search work together with length penalty,
        # frequency penalty, stopping criteria, etc.?
        beam_width = params.beam_width
        max_tokens = params.max_tokens
        temperature = params.temperature
        ignore_eos = params.ignore_eos
        length_penalty = params.length_penalty

        lora_requests = self._get_beam_search_lora_requests(
            lora_request, prompts)

        tokenizer = self.get_tokenizer()
        sort_beams_key = create_sort_beams_key_function(
            tokenizer.eos_token_id,
            length_penalty,
        )

        def create_tokens_prompt_from_beam(
                beam: BeamSearchSequence) -> TokensPrompt:
            token_prompt_kwargs: TokensPrompt = {
                "prompt_token_ids": beam.tokens
            }
            if beam.multi_modal_data is not None:
                token_prompt_kwargs["multi_modal_data"] = beam.multi_modal_data

            if beam.mm_processor_kwargs is not None:
                token_prompt_kwargs[
                    "mm_processor_kwargs"] = beam.mm_processor_kwargs
            return TokensPrompt(**token_prompt_kwargs)

        # generate 2 * beam_width candidates at each step
        # following the huggingface transformers implementation
        # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
        beam_search_params = SamplingParams(logprobs=2 * beam_width,
                                            max_tokens=1,
                                            temperature=temperature)
        instances: list[BeamSearchInstance] = []

        for lora_req, prompt in zip(lora_requests, prompts):
            # Add multimodal processor kwargs & data
            mm_kwargs = {}
            if "multi_modal_data" in prompt:
                mm_kwargs["multi_modal_data"] = prompt["multi_modal_data"]
            if "mm_processor_kwargs" in prompt:
                mm_kwargs["mm_processor_kwargs"] = prompt[
                    "mm_processor_kwargs"]

            if "prompt_token_ids" in prompt:
                prompt = cast(TokensPrompt, prompt)  # Needed for mypy
                prompt_tokens = prompt["prompt_token_ids"]
            else:
                prompt_tokens = tokenizer.encode(prompt["prompt"])

            instances.append(
                BeamSearchInstance(
                    prompt_tokens,
                    lora_request=lora_req,
                    logprobs=None,
                    **mm_kwargs,
                ), )

        token_iter = range(max_tokens)
        if use_tqdm:
            token_iter = tqdm(token_iter,
                              desc="Beam search",
                              unit="token",
                              unit_scale=False)
            logger.warning(
                "The progress bar shows the upper bound on token steps and "
                "may finish early due to stopping conditions. It does not "
                "reflect instance-level progress.")

        for _ in token_iter:
            all_beams: list[BeamSearchSequence] = list(
                sum((instance.beams for instance in instances), []))
            pos = [0] + list(
                itertools.accumulate(
                    len(instance.beams) for instance in instances))
            instance_start_and_end: list[tuple[int, int]] = list(
                zip(pos[:-1], pos[1:]))

            if len(all_beams) == 0:
                break

            # create the corresponding batch entries for prompt & optional lora
            prompts_batch, lora_req_batch = zip(
                *[(create_tokens_prompt_from_beam(beam), beam.lora_request)
                  for beam in all_beams])

            # only runs for one step
            # we don't need to use tqdm here
            output = self.generate(prompts_batch,
                                   sampling_params=beam_search_params,
                                   use_tqdm=False,
                                   lora_request=lora_req_batch)

            for (start, end), instance in zip(instance_start_and_end,
                                              instances):
                instance_new_beams = []
                for i in range(start, end):
                    current_beam = all_beams[i]
                    result = output[i]

                    if result.outputs[0].logprobs is not None:
                        # if `result.outputs[0].logprobs` is None, it means
                        # the sequence is completed because of the
                        # max-model-len limit or because it was aborted.
                        # We don't need to add it to the new beams.
                        logprobs = result.outputs[0].logprobs[0]
                        for token_id, logprob_obj in logprobs.items():
                            new_beam = BeamSearchSequence(
                                tokens=current_beam.tokens + [token_id],
                                logprobs=current_beam.logprobs + [logprobs],
                                lora_request=current_beam.lora_request,
                                cum_logprob=current_beam.cum_logprob +
                                logprob_obj.logprob,
                                multi_modal_data=current_beam.multi_modal_data,
                                mm_processor_kwargs=current_beam.
                                mm_processor_kwargs)

                            if token_id == tokenizer.eos_token_id and \
                                not ignore_eos:
                                instance.completed.append(new_beam)
                            else:
                                instance_new_beams.append(new_beam)
                sorted_beams = sorted(instance_new_beams,
                                      key=sort_beams_key,
                                      reverse=True)
                instance.beams = sorted_beams[:beam_width]

        outputs = []
        for instance in instances:
            instance.completed.extend(instance.beams)
            sorted_completed = sorted(instance.completed,
                                      key=sort_beams_key,
                                      reverse=True)
            best_beams = sorted_completed[:beam_width]

            for beam in best_beams:
                beam.text = tokenizer.decode(beam.tokens)
            outputs.append(BeamSearchOutput(sequences=best_beams))

        return outputs
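
A short beam-search sketch (assumes `BeamSearchParams` is importable from `vllm.sampling_params` and that `llm` wraps a generative model, as in the earlier sketch):

from vllm.sampling_params import BeamSearchParams

params = BeamSearchParams(beam_width=4, max_tokens=32)
results = llm.beam_search([{"prompt": "The capital of France is"}], params)
for result in results:
    # sequences come back sorted best-first; `text` is decoded at the end
    print(result.sequences[0].text)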

    def chat(
        self,
        messages: Union[list[ChatCompletionMessageParam],
                        list[list[ChatCompletionMessageParam]]],
        sampling_params: Optional[Union[SamplingParams,
                                        list[SamplingParams]]] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[LoRARequest] = None,
        chat_template: Optional[str] = None,
        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
        add_generation_prompt: bool = True,
        continue_final_message: bool = False,
        tools: Optional[list[dict[str, Any]]] = None,
        chat_template_kwargs: Optional[dict[str, Any]] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[RequestOutput]:
        """
        Generate responses for a chat conversation.

        The chat conversation is converted into a text prompt using the
        tokenizer, and the [generate][] method is called to generate the
        responses.

        Multi-modal inputs can be passed in the same way you would pass them
        to the OpenAI API.

        Args:
            messages: A list of conversations or a single conversation.

                - Each conversation is represented as a list of messages.
                - Each message is a dictionary with 'role' and 'content' keys.

            sampling_params: The sampling parameters for text generation.
                If None, we use the default sampling parameters. When it
                is a single value, it is applied to every prompt. When it
                is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            chat_template: The template to use for structuring the chat.
                If not provided, the model's default chat template will be used.
            chat_template_content_format: The format to render message content.

                - "string" will render the content as a string.
                  Example: `"Who are you?"`
                - "openai" will render the content as a list of dictionaries,
                  similar to OpenAI schema.
                  Example: `[{"type": "text", "text": "Who are you?"}]`

            add_generation_prompt: If True, adds a generation template
                to each message.
            continue_final_message: If True, continues the final message in
                the conversation instead of starting a new one. Cannot be
                `True` if `add_generation_prompt` is also `True`.
            chat_template_kwargs: Additional kwargs to pass to the chat
                template.
            mm_processor_kwargs: Multimodal processor kwarg overrides for this
                chat request. Only used for offline requests.

        Returns:
            A list of `RequestOutput` objects containing the generated
            responses in the same order as the input messages.
        """
        list_of_messages: list[list[ChatCompletionMessageParam]]

        # Handle multi and single conversations
        if is_list_of(messages, list):
            # messages is list[list[...]]
            list_of_messages = cast(list[list[ChatCompletionMessageParam]],
                                    messages)
        else:
            # messages is list[...]
            list_of_messages = [
                cast(list[ChatCompletionMessageParam], messages)
            ]

        tokenizer = self.get_tokenizer(lora_request)
        model_config = self.llm_engine.get_model_config()
        resolved_content_format = resolve_chat_template_content_format(
            chat_template,
            tools,
            chat_template_content_format,
            tokenizer,
            model_config=model_config,
        )

        _chat_template_kwargs: dict[str, Any] = dict(
            chat_template=chat_template,
            add_generation_prompt=add_generation_prompt,
            continue_final_message=continue_final_message,
            tools=tools,
        )
        _chat_template_kwargs.update(chat_template_kwargs or {})

        prompts: list[Union[TokensPrompt, TextPrompt]] = []

        for msgs in list_of_messages:
            # NOTE: _parse_chat_message_content_parts() currently doesn't
            # handle mm_processor_kwargs, since there is no implementation in
            # the chat message parsing for it.
            conversation, mm_data = parse_chat_messages(
                msgs,
                model_config,
                tokenizer,
                content_format=resolved_content_format,
            )

            if isinstance(tokenizer, MistralTokenizer):
                prompt_token_ids = apply_mistral_chat_template(
                    tokenizer,
                    messages=msgs,
                    **_chat_template_kwargs,
                )
            else:
                prompt_str = apply_hf_chat_template(
                    tokenizer=tokenizer,
                    conversation=conversation,
                    model_config=model_config,
                    **_chat_template_kwargs,
                )
                # Special tokens are already included in chat templates so
                # should not be added by the tokenizer in this case.
                prompt_token_ids = tokenizer.encode(prompt_str,
                                                    add_special_tokens=False)

            prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)

            if mm_data is not None:
                prompt["multi_modal_data"] = mm_data

            if mm_processor_kwargs is not None:
                prompt["mm_processor_kwargs"] = mm_processor_kwargs

            prompts.append(prompt)

        return self.generate(
            prompts,
            sampling_params=sampling_params,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
        )
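
A minimal `chat` sketch, reusing the `llm` instance from the first sketch; the conversation format mirrors the OpenAI messages schema described in the docstring above:

conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about the sea."},
]
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)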

    def encode(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        pooling_task: PoolingTask = "encode",
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[PoolingRequestOutput]:
        """Apply pooling to the hidden states corresponding to the input
        prompts.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_task: Override the pooling task to use.

        Returns:
            A list of `PoolingRequestOutput` objects containing the
            pooled hidden states in the same order as the input prompts.

        Note:
            Using `prompts` and `prompt_token_ids` as keyword parameters is
            considered legacy and may be deprecated in the future. You should
            instead pass them via the `inputs` parameter.
        """
        if pooling_task is None:
            if "embed" in self.supported_tasks:
                pooling_task = "embed"
            else:
                pooling_task = "encode"

            logger.warning_once(
                "`LLM.encode` is currently using `pooling_task = %s`.\n"
                "Please use one of the more specific methods or set the "
                "task directly when using `LLM.encode`:\n"
                "  - For embeddings, use `LLM.embed(...)` "
                "or `pooling_task=\"embed\"`.\n"
                "  - For classification logits, use `LLM.classify(...)` "
                "or `pooling_task=\"classify\"`.\n"
                "  - For rewards, use `LLM.reward(...)` "
                "or `pooling_task=\"reward\"`\n"
                "  - For similarity scores, use `LLM.score(...)`.",
                pooling_task)

        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "pooling":
            raise ValueError(
                "LLM.encode() is only supported for pooling models. "
                "Try passing `--runner pooling` to use the model as a "
                "pooling model.")

        if pooling_task not in self.supported_tasks:
            raise ValueError(
                f"pooling_task must be one of {self.supported_tasks}.")

        if pooling_params is None:
            # Use default pooling params.
            pooling_params = PoolingParams()

        if isinstance(pooling_params, PoolingParams):
            pooling_params.verify(pooling_task, model_config)
        else:
            for pooling_param in pooling_params:
                pooling_param.verify(pooling_task, model_config)

        if tokenization_kwargs is None:
            tokenization_kwargs = dict[str, Any]()
            _validate_truncation_size(model_config.max_model_len,
                                      truncate_prompt_tokens,
                                      tokenization_kwargs)

        self._validate_and_add_requests(
            prompts=prompts,
            params=pooling_params,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            tokenization_kwargs=tokenization_kwargs,
        )

        outputs = self._run_engine(use_tqdm=use_tqdm)
        return self.engine_class.validate_outputs(outputs,
                                                  PoolingRequestOutput)
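
A hedged `encode` sketch for a pooling model (the model name is an assumption; any embedding-capable checkpoint works, and `runner="pooling"` matches the error message above):

llm_pool = LLM(model="BAAI/bge-base-en-v1.5", runner="pooling")
outputs = llm_pool.encode(["What is the capital of France?"],
                          pooling_task="embed")
print(outputs[0].outputs)  # PoolingOutput with the pooled hidden states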

    def embed(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[EmbeddingRequestOutput]:
        """
        Generate an embedding vector for each prompt.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.

        Returns:
            A list of `EmbeddingRequestOutput` objects containing the
            embedding vectors in the same order as the input prompts.
        """
        if "embed" not in self.supported_tasks:
            raise ValueError(
                "Embedding API is not supported by this model. "
                "Try converting the model using `--convert embed`.")

        items = self.encode(
            prompts,
            truncate_prompt_tokens=truncate_prompt_tokens,
            use_tqdm=use_tqdm,
            pooling_params=pooling_params,
            lora_request=lora_request,
            pooling_task="embed",
        )

        return [EmbeddingRequestOutput.from_base(item) for item in items]
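
`embed` is a thin wrapper around `encode(..., pooling_task="embed")`; a sketch follows (the model name is an assumption, and `.embedding` is taken to be the `EmbeddingOutput` field holding the vector):

llm_emb = LLM(model="BAAI/bge-base-en-v1.5")
outputs = llm_emb.embed(["hello world", "vLLM is fast"])
vectors = [o.outputs.embedding for o in outputs]  # one list[float] per prompt
print(len(vectors), len(vectors[0]))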

    def classify(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ClassificationRequestOutput]:
        """
        Generate class logits for each prompt.

        This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
        Returns:
            A list of `ClassificationRequestOutput` objects containing the
            class logits in the same order as the input prompts.
        """
        if "classify" not in self.supported_tasks:
            raise ValueError(
                "Classification API is not supported by this model. "
                "Try converting the model using `--convert classify`.")

        items = self.encode(
            prompts,
            use_tqdm=use_tqdm,
            pooling_params=pooling_params,
            lora_request=lora_request,
            pooling_task="classify",
        )

        return [ClassificationRequestOutput.from_base(item) for item in items]
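
A sketch of `classify`; the model name is a hypothetical placeholder for any checkpoint converted with `--convert classify`, and `.probs` is assumed to be the `ClassificationOutput` field with the per-class probabilities:

llm_cls = LLM(model="your-org/your-sequence-classifier")  # hypothetical name
outputs = llm_cls.classify(["This movie was fantastic!"])
print(outputs[0].outputs.probs)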

    def reward(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        /,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams,
                                       Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[PoolingRequestOutput]:
        """
        Generate rewards for each prompt.

        Args:
            prompts: The prompts to the LLM. You may pass a sequence of prompts
                for batch inference. See [PromptType][vllm.inputs.PromptType]
                for more details about the format of each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
        Returns:
            A list of `PoolingRequestOutput` objects containing the
            pooled hidden states in the same order as the input prompts.
        """

        return self.encode(
            prompts,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            pooling_params=pooling_params,
            truncate_prompt_tokens=truncate_prompt_tokens,
            pooling_task="encode",
        )
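
`reward` simply forwards to `encode(..., pooling_task="encode")`, so the pooled hidden states come back unchanged; a sketch with a hypothetical reward model:

llm_rm = LLM(model="your-org/your-reward-model")  # hypothetical name
outputs = llm_rm.reward(["The assistant answered the question correctly."])
print(outputs[0].outputs)  # PoolingOutput holding the reward head's output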

    def _embedding_score(
        self,
        tokenizer: AnyTokenizer,
        text_1: list[Union[str, TextPrompt, TokensPrompt]],
        text_2: list[Union[str, TextPrompt, TokensPrompt]],
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[PoolingParams] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]:

        encoded_output: list[PoolingRequestOutput] = self.encode(
            text_1 + text_2,
            truncate_prompt_tokens=truncate_prompt_tokens,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            pooling_params=pooling_params,
            pooling_task="embed",
        )

        encoded_output_1: list[PoolingRequestOutput] = encoded_output[
            0:len(text_1)]
        encoded_output_2: list[PoolingRequestOutput] = encoded_output[
            len(text_1):]

        if len(encoded_output_1) == 1:
            encoded_output_1 = encoded_output_1 * len(encoded_output_2)

        scores = _cosine_similarity(tokenizer=tokenizer,
                                    embed_1=encoded_output_1,
                                    embed_2=encoded_output_2)

        items = self.engine_class.validate_outputs(scores,
                                                   PoolingRequestOutput)
        return [ScoringRequestOutput.from_base(item) for item in items]

    def _cross_encoding_score(
        self,
        tokenizer: AnyTokenizer,
        data_1: Union[list[str], list[ScoreContentPartParam]],
        data_2: Union[list[str], list[ScoreContentPartParam]],
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[PoolingParams] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]:
        model_config = self.llm_engine.model_config

        if isinstance(tokenizer, MistralTokenizer):
            raise ValueError(
                "Score API is not supported for Mistral tokenizer")

        if len(data_1) == 1:
            data_1 = data_1 * len(data_2)

        if pooling_params is None:
            pooling_params = PoolingParams(task="score")

        model_config = self.llm_engine.model_config
        pooling_params.verify("score", model_config)
        pooling_params_list = list[PoolingParams]()

        tokenization_kwargs: dict[str, Any] = {}

        _validate_truncation_size(model_config.max_model_len,
                                  truncate_prompt_tokens, tokenization_kwargs)

        prompts = list[PromptType]()

        input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

        model_config = self.llm_engine.model_config

        for q, d in input_pairs:
            _, engine_prompt = get_score_prompt(
                model_config=model_config,
                data_1=q,
                data_2=d,
                tokenizer=tokenizer,
                tokenization_kwargs=tokenization_kwargs,
            )

            if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop(
                    "token_type_ids", None)):
                params = pooling_params.clone()
                compressed = compress_token_type_ids(token_type_ids)
                params.extra_kwargs = {"compressed_token_type_ids": compressed}
                pooling_params_list.append(params)
            else:
                pooling_params_list.append(pooling_params)

            prompts.append(engine_prompt)

        self._validate_and_add_requests(
            prompts=prompts,
            params=pooling_params_list,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
        )

        outputs = self._run_engine(use_tqdm=use_tqdm)
        items = self.engine_class.validate_outputs(outputs,
                                                   PoolingRequestOutput)

        return [ScoringRequestOutput.from_base(item) for item in items]

    def score(
        self,
        data_1: Union[SingletonPrompt, Sequence[SingletonPrompt],
                      ScoreMultiModalParam],
        data_2: Union[SingletonPrompt, Sequence[SingletonPrompt],
                      ScoreMultiModalParam],
        /,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[PoolingParams] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]:
        """Generate similarity scores for all pairs `<text,text_pair>` or
          `<multi-modal data, multi-modal data pair>`.

        The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
        In the `1 -> N` case the `data_1` input will be replicated `N`
        times to pair with the `data_2` inputs.
        The input pairs are used to build a list of prompts for the
        cross encoder model. This class automatically batches the prompts,
        considering the memory constraint. For the best performance, put all
        of your inputs into a single list and pass it to this method.

        Supports both text and multi-modal data (images, etc.) when used with
        appropriate multi-modal models. For multi-modal inputs, ensure the
        prompt structure matches the model's expected input format.

        Args:
            data_1: Can be a single prompt, a list of prompts or
                `ScoreMultiModalParam`, which can contain either text or
                multi-modal data. When a list, it must have the same length as
                the `data_2` list.
            data_2: The data to pair with the query to form the input to
                the LLM. Can be text or multi-modal data. See [PromptType]
                [vllm.inputs.PromptType] for more details about the format of
                each prompt.
            use_tqdm: If `True`, shows a tqdm progress bar.
                If a callable (e.g., `functools.partial(tqdm, leave=False)`),
                it is used to create the progress bar.
                If `False`, no progress bar is created.
            lora_request: LoRA request to use for generation, if any.
            pooling_params: The pooling parameters for pooling. If None, we
                use the default pooling parameters.
        Returns:
            A list of `ScoringRequestOutput` objects containing the
            generated scores in the same order as the input prompts.
        """
        model_config = self.llm_engine.model_config
        runner_type = model_config.runner_type
        if runner_type != "pooling":
            raise ValueError(
                "LLM.score() is only supported for pooling models. "
                "Try passing `--runner pooling` to use the model as a "
                "pooling model.")

        supported_tasks = self.supported_tasks
        if all(t not in supported_tasks for t in ("embed", "classify")):
            raise ValueError("Score API is not supported by this model. "
                             "Try converting the model using "
                             "`--convert embed` or `--convert classify`.")

        if (model_config.is_cross_encoder
                and getattr(model_config.hf_config, "num_labels", 0) != 1):
            raise ValueError("Score API is only enabled for num_labels == 1.")

        # the tokenizer for models such as
        # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
        # lists of tokens to the `text` and `text_pair` kwargs
        tokenizer = self.get_tokenizer()

        if not model_config.is_multimodal_model:

            def check_data_type(data: Union[SingletonPrompt,
                                            Sequence[SingletonPrompt],
                                            ScoreMultiModalParam]):
                if isinstance(data, dict) and "content" in data:
                    raise ValueError("ScoreMultiModalParam is not supported "
                                     f"for {model_config.architecture}")

            check_data_type(data_1)
            check_data_type(data_2)

            def ensure_str(prompt: SingletonPrompt):
                if isinstance(prompt, dict):
                    if "multi_modal_data" in prompt:
                        raise ValueError("Multi-modal prompt is not "
                                         "supported for scoring")
                    elif "prompt_token_ids" in prompt:
                        prompt = tokenizer.decode(
                            cast(TokensPrompt, prompt)["prompt_token_ids"])
                    elif "prompt" in prompt:
                        prompt = cast(TextPrompt, prompt)["prompt"]
                assert type(prompt) is str
                return prompt

            if isinstance(data_1, (str, dict)):
                # Convert a single prompt to a list.
                data_1 = [data_1]  # type: ignore[list-item]

            data_1 = [ensure_str(t) for t in data_1]

            if isinstance(data_2, (str, dict)):
                # Convert a single prompt to a list.
                data_2 = [data_2]  # type: ignore[list-item]

            data_2 = [ensure_str(t) for t in data_2]

        if isinstance(data_1, dict) and "content" in data_1:
            data_1 = data_1.get("content")  # type: ignore[assignment]
        elif isinstance(data_1, str):
            data_1 = [data_1]

        if isinstance(data_2, dict) and "content" in data_2:
            data_2 = data_2.get("content")  # type: ignore[assignment]
        elif isinstance(data_2, str):
            data_2 = [data_2]

        _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]

        if model_config.is_cross_encoder:
            return self._cross_encoding_score(
                tokenizer,
                data_1,  # type: ignore[arg-type]
                data_2,  # type: ignore[arg-type]
                truncate_prompt_tokens,
                use_tqdm,
                pooling_params,
                lora_request)
        else:
            return self._embedding_score(
                tokenizer,
                data_1,  # type: ignore[arg-type]
                data_2,  # type: ignore[arg-type]
                truncate_prompt_tokens,
                use_tqdm,
                pooling_params,
                lora_request)
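
A `score` sketch using the cross-encoder mentioned in the comment above; `.score` on `ScoringOutput` holds the similarity score for each pair:

llm_xenc = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2")
outputs = llm_xenc.score(
    "What is the capital of France?",
    ["Paris is the capital of France.", "The sky is blue."],
)
for out in outputs:
    print(out.outputs.score)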

    def start_profile(self) -> None:
        self.llm_engine.start_profile()

    def stop_profile(self) -> None:
        self.llm_engine.stop_profile()

    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        return self.llm_engine.reset_prefix_cache(device)

    def sleep(self, level: int = 1):
        """
        Put the engine to sleep. The engine should not process any requests.
        The caller should guarantee that no requests are being processed
        during the sleep period, before `wake_up` is called.

        Args:
            level: The sleep level. Level 1 sleep will offload the model
                weights and discard the kv cache. The content of kv cache
                is forgotten. Level 1 sleep is good for sleeping and waking
                up the engine to run the same model again. The model weights
                are backed up in CPU memory. Please make sure there's enough
                CPU memory to store the model weights. Level 2 sleep will
                discard both the model weights and the kv cache. The content
                of both the model weights and kv cache is forgotten. Level 2
                sleep is good for sleeping and waking up the engine to run a
                different model or update the model, where previous model
                weights are not needed. It reduces CPU memory pressure.
        """
        self.reset_prefix_cache()
        self.llm_engine.sleep(level=level)

    def wake_up(self, tags: Optional[list[str]] = None):
        """
        Wake up the engine from sleep mode. See the [sleep][] method
        for more details.

        Args:
            tags: An optional list of tags to reallocate the engine memory
                for specific memory allocations. Values must be in
                `("weights", "kv_cache")`. If None, all memory is reallocated.
                wake_up should be called with all tags (or None) before the
                engine is used again.
        """
        self.llm_engine.wake_up(tags)
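
A sketch of the sleep/wake-up cycle described above; the engine must stay idle for the whole window, and `tags=["weights"]` / `tags=["kv_cache"]` can be passed to reallocate memory in stages instead of all at once:

llm.sleep(level=1)   # offload weights to CPU memory, discard the KV cache
# ... the GPU memory is free for other work while the engine sleeps ...
llm.wake_up()        # reallocate everything (weights and KV cache) before reuse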

    def get_metrics(self) -> list["Metric"]:
        """Return a snapshot of aggregated metrics from Prometheus.

        Returns:
            A list of ``Metric`` objects capturing the current state
            of all aggregated metrics from Prometheus.

        Note:
            This method is only available with the V1 LLM engine.
        """
        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
        assert isinstance(self.llm_engine, V1LLMEngine)
        return self.llm_engine.get_metrics()
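
A sketch of reading the aggregated metrics (V1 engine only, as noted above); printing the objects directly avoids assuming specific `Metric` fields:

for metric in llm.get_metrics():
    print(metric)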

    def _validate_and_add_requests(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                      Sequence[PoolingParams]],
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        priority: Optional[list[int]] = None,
    ) -> None:
        if isinstance(prompts, (str, dict)):
            # Convert a single prompt to a list.
            prompts = [prompts]

        num_requests = len(prompts)
        if isinstance(params, Sequence) and len(params) != num_requests:
            raise ValueError("The lengths of prompts and params "
                             "must be the same.")
        if isinstance(lora_request,
                      Sequence) and len(lora_request) != num_requests:
            raise ValueError("The lengths of prompts and lora_request "
                             "must be the same.")

        for sp in params if isinstance(params, Sequence) else (params, ):
            if isinstance(sp, SamplingParams):
                # We only care about the final output
                sp.output_kind = RequestOutputKind.FINAL_ONLY

        # Add requests to the engine.
        it = prompts
        if use_tqdm:
            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
            it = tqdm_func(it, desc="Adding requests")

        for i, prompt in enumerate(it):
            self._add_request(
                prompt,
                params[i] if isinstance(params, Sequence) else params,
                tokenization_kwargs=tokenization_kwargs,
                lora_request=lora_request[i] if isinstance(
                    lora_request, Sequence) else lora_request,
                priority=priority[i] if priority else 0,
            )

    def _add_request(
        self,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        lora_request: Optional[LoRARequest] = None,
        priority: int = 0,
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(
            request_id,
            prompt,
            params,
            lora_request=lora_request,
            tokenization_kwargs=tokenization_kwargs,
            priority=priority,
        )

    def _run_engine(
        self,
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True
    ) -> list[Union[RequestOutput, PoolingRequestOutput]]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
            pbar = tqdm_func(
                total=num_requests,
                desc="Processed prompts",
                dynamic_ncols=True,
                postfix=(f"est. speed input: {0:.2f} toks/s, "
                         f"output: {0:.2f} toks/s"),
            )

        # Run the engine.
        outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
        total_in_toks = 0
        total_out_toks = 0
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        if isinstance(output, RequestOutput):
                            # Calculate tokens only for RequestOutput
                            n = len(output.outputs)
                            assert output.prompt_token_ids is not None
                            total_in_toks += len(output.prompt_token_ids) * n
                            in_spd = total_in_toks / pbar.format_dict["elapsed"]
                            total_out_toks += sum(
                                len(stp.token_ids) for stp in output.outputs)
                            out_spd = (total_out_toks /
                                       pbar.format_dict["elapsed"])
                            pbar.postfix = (
                                f"est. speed input: {in_spd:.2f} toks/s, "
                                f"output: {out_spd:.2f} toks/s")
                            pbar.update(n)
                        else:
                            pbar.update(1)
                        if pbar.n == num_requests:
                            pbar.refresh()

        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # their preceding requests.
        return sorted(outputs, key=lambda x: int(x.request_id))

default_sampling_params instance-attribute

default_sampling_params: Union[dict[str, Any], None] = None

engine_class instance-attribute

engine_class = type(llm_engine)

llm_engine instance-attribute

llm_engine = from_engine_args(
    engine_args=engine_args, usage_context=LLM_CLASS
)

request_counter instance-attribute

request_counter = Counter()

supported_tasks instance-attribute

supported_tasks = supported_tasks

__init__

__init__(
    model: str,
    *,
    runner: RunnerOption = "auto",
    convert: ConvertOption = "auto",
    tokenizer: Optional[str] = None,
    tokenizer_mode: TokenizerMode = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    tensor_parallel_size: int = 1,
    dtype: ModelDType = "auto",
    quantization: Optional[QuantizationMethods] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: Optional[int] = None,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: bool = False,
    max_seq_len_to_capture: int = 8192,
    disable_custom_all_reduce: bool = False,
    disable_async_output_proc: bool = False,
    hf_token: Optional[Union[bool, str]] = None,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
    override_pooler_config: Optional[PoolerConfig] = None,
    compilation_config: Optional[
        Union[int, dict[str, Any], CompilationConfig]
    ] = None,
    logits_processors: Optional[
        list[Union[str, type[LogitsProcessor]]]
    ] = None,
    **kwargs,
) -> None

LLM constructor.
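
A hedged constructor sketch using a few of the keyword arguments from the signature above (the model name is just an example):

from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=1,        # number of GPUs for tensor parallelism
    gpu_memory_utilization=0.9,    # fraction of GPU memory vLLM may claim
    dtype="auto",                  # pick the dtype from the model config
)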

Source code in vllm/entrypoints/llm.py
def __init__(
    self,
    model: str,
    *,
    runner: RunnerOption = "auto",
    convert: ConvertOption = "auto",
    tokenizer: Optional[str] = None,
    tokenizer_mode: TokenizerMode = "auto",
    skip_tokenizer_init: bool = False,
    trust_remote_code: bool = False,
    allowed_local_media_path: str = "",
    tensor_parallel_size: int = 1,
    dtype: ModelDType = "auto",
    quantization: Optional[QuantizationMethods] = None,
    revision: Optional[str] = None,
    tokenizer_revision: Optional[str] = None,
    seed: Optional[int] = None,
    gpu_memory_utilization: float = 0.9,
    swap_space: float = 4,
    cpu_offload_gb: float = 0,
    enforce_eager: bool = False,
    max_seq_len_to_capture: int = 8192,
    disable_custom_all_reduce: bool = False,
    disable_async_output_proc: bool = False,
    hf_token: Optional[Union[bool, str]] = None,
    hf_overrides: Optional[HfOverrides] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
    override_pooler_config: Optional[PoolerConfig] = None,
    compilation_config: Optional[Union[int, dict[str, Any],
                                       CompilationConfig]] = None,
    logits_processors: Optional[list[Union[str,
                                           type[LogitsProcessor]]]] = None,
    **kwargs,
) -> None:
    """LLM constructor."""

    if "disable_log_stats" not in kwargs:
        kwargs["disable_log_stats"] = True

    if "worker_cls" in kwargs:
        worker_cls = kwargs["worker_cls"]
        # if the worker_cls is not a qualified string name,
        # we serialize it using cloudpickle to avoid pickling issues
        if isinstance(worker_cls, type):
            kwargs["worker_cls"] = cloudpickle.dumps(worker_cls)

    if "kv_transfer_config" in kwargs and isinstance(
            kwargs["kv_transfer_config"], dict):
        from vllm.config import KVTransferConfig
        raw_config_dict = kwargs["kv_transfer_config"]
        try:
            kwargs["kv_transfer_config"] = KVTransferConfig(
                **raw_config_dict)
        except ValidationError as e:
            logger.error(
                "Failed to convert 'kv_transfer_config' dict to "
                "KVTransferConfig object. Dict: %s. Error: %s",
                raw_config_dict, e)
            # Consider re-raising a more specific vLLM error or ValueError
            # to provide better context to the user.
            raise ValueError(
                f"Invalid 'kv_transfer_config' provided: {e}") from e

    if hf_overrides is None:
        hf_overrides = {}

    if compilation_config is not None:
        if isinstance(compilation_config, int):
            compilation_config_instance = CompilationConfig(
                level=compilation_config)
        elif isinstance(compilation_config, dict):
            predicate = lambda x: is_init_field(CompilationConfig, x[0])
            compilation_config_instance = CompilationConfig(
                **dict(filter(predicate, compilation_config.items())))
        else:
            compilation_config_instance = compilation_config
    else:
        compilation_config_instance = CompilationConfig()

    engine_args = EngineArgs(
        model=model,
        runner=runner,
        convert=convert,
        tokenizer=tokenizer,
        tokenizer_mode=tokenizer_mode,
        skip_tokenizer_init=skip_tokenizer_init,
        trust_remote_code=trust_remote_code,
        allowed_local_media_path=allowed_local_media_path,
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        quantization=quantization,
        revision=revision,
        tokenizer_revision=tokenizer_revision,
        seed=seed,
        gpu_memory_utilization=gpu_memory_utilization,
        swap_space=swap_space,
        cpu_offload_gb=cpu_offload_gb,
        enforce_eager=enforce_eager,
        max_seq_len_to_capture=max_seq_len_to_capture,
        disable_custom_all_reduce=disable_custom_all_reduce,
        disable_async_output_proc=disable_async_output_proc,
        hf_token=hf_token,
        hf_overrides=hf_overrides,
        mm_processor_kwargs=mm_processor_kwargs,
        override_pooler_config=override_pooler_config,
        compilation_config=compilation_config_instance,
        logits_processors=logits_processors,
        **kwargs,
    )

    log_non_default_args(engine_args)

    # Create the Engine (autoselects V0 vs V1)
    self.llm_engine = LLMEngine.from_engine_args(
        engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
    self.engine_class = type(self.llm_engine)

    self.request_counter = Counter()
    self.default_sampling_params: Union[dict[str, Any], None] = None

    if envs.VLLM_USE_V1:
        supported_tasks = self.llm_engine \
            .get_supported_tasks()  # type: ignore
    else:
        supported_tasks = self.llm_engine.model_config.supported_tasks

    logger.info("Supported_tasks: %s", supported_tasks)

    self.supported_tasks = supported_tasks

_add_request

_add_request(
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    lora_request: Optional[LoRARequest] = None,
    priority: int = 0,
) -> None
Source code in vllm/entrypoints/llm.py
def _add_request(
    self,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    lora_request: Optional[LoRARequest] = None,
    priority: int = 0,
) -> None:
    request_id = str(next(self.request_counter))
    self.llm_engine.add_request(
        request_id,
        prompt,
        params,
        lora_request=lora_request,
        tokenization_kwargs=tokenization_kwargs,
        priority=priority,
    )

_cross_encoding_score

_cross_encoding_score(
    tokenizer: AnyTokenizer,
    data_1: Union[list[str], list[ScoreContentPartParam]],
    data_2: Union[list[str], list[ScoreContentPartParam]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ScoringRequestOutput]
Source code in vllm/entrypoints/llm.py
def _cross_encoding_score(
    self,
    tokenizer: AnyTokenizer,
    data_1: Union[list[str], list[ScoreContentPartParam]],
    data_2: Union[list[str], list[ScoreContentPartParam]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:
    model_config = self.llm_engine.model_config

    if isinstance(tokenizer, MistralTokenizer):
        raise ValueError(
            "Score API is not supported for Mistral tokenizer")

    if len(data_1) == 1:
        data_1 = data_1 * len(data_2)

    if pooling_params is None:
        pooling_params = PoolingParams(task="score")

    model_config = self.llm_engine.model_config
    pooling_params.verify("score", model_config)
    pooling_params_list = list[PoolingParams]()

    tokenization_kwargs: dict[str, Any] = {}

    _validate_truncation_size(model_config.max_model_len,
                              truncate_prompt_tokens, tokenization_kwargs)

    prompts = list[PromptType]()

    input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

    model_config = self.llm_engine.model_config

    for q, d in input_pairs:
        _, engine_prompt = get_score_prompt(
            model_config=model_config,
            data_1=q,
            data_2=d,
            tokenizer=tokenizer,
            tokenization_kwargs=tokenization_kwargs,
        )

        if envs.VLLM_USE_V1 and (token_type_ids := engine_prompt.pop(
                "token_type_ids", None)):
            params = pooling_params.clone()
            compressed = compress_token_type_ids(token_type_ids)
            params.extra_kwargs = {"compressed_token_type_ids": compressed}
            pooling_params_list.append(params)
        else:
            pooling_params_list.append(pooling_params)

        prompts.append(engine_prompt)

    self._validate_and_add_requests(
        prompts=prompts,
        params=pooling_params_list,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)
    items = self.engine_class.validate_outputs(outputs,
                                               PoolingRequestOutput)

    return [ScoringRequestOutput.from_base(item) for item in items]

_embedding_score

_embedding_score(
    tokenizer: AnyTokenizer,
    text_1: list[Union[str, TextPrompt, TokensPrompt]],
    text_2: list[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ScoringRequestOutput]
Source code in vllm/entrypoints/llm.py
def _embedding_score(
    self,
    tokenizer: AnyTokenizer,
    text_1: list[Union[str, TextPrompt, TokensPrompt]],
    text_2: list[Union[str, TextPrompt, TokensPrompt]],
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:

    encoded_output: list[PoolingRequestOutput] = self.encode(
        text_1 + text_2,
        truncate_prompt_tokens=truncate_prompt_tokens,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        pooling_params=pooling_params,
        pooling_task="embed",
    )

    encoded_output_1: list[PoolingRequestOutput] = encoded_output[
        0:len(text_1)]
    encoded_output_2: list[PoolingRequestOutput] = encoded_output[
        len(text_1):]

    if len(encoded_output_1) == 1:
        encoded_output_1 = encoded_output_1 * len(encoded_output_2)

    scores = _cosine_similarity(tokenizer=tokenizer,
                                embed_1=encoded_output_1,
                                embed_2=encoded_output_2)

    items = self.engine_class.validate_outputs(scores,
                                               PoolingRequestOutput)
    return [ScoringRequestOutput.from_base(item) for item in items]

_get_beam_search_lora_requests

_get_beam_search_lora_requests(
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ],
    prompts: list[Union[TokensPrompt, TextPrompt]],
) -> list[Optional[LoRARequest]]

Get the optional lora request corresponding to each prompt.

Source code in vllm/entrypoints/llm.py
def _get_beam_search_lora_requests(
    self,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
    prompts: list[Union[TokensPrompt, TextPrompt]],
) -> list[Optional[LoRARequest]]:
    """Get the optional lora request corresponding to each prompt."""
    if isinstance(lora_request,
                  Sequence) and len(lora_request) != len(prompts):
        raise ValueError(
            "Lora request list should be the same length as the prompts")

    if lora_request is None or isinstance(lora_request, LoRARequest):
        return [lora_request] * len(prompts)

    raise TypeError(f"Invalid lora_request type {type(lora_request)}")

_get_modality_specific_lora_reqs

_get_modality_specific_lora_reqs(
    prompts: Union[PromptType, Sequence[PromptType]],
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ],
)
Source code in vllm/entrypoints/llm.py
def _get_modality_specific_lora_reqs(
        self, prompts: Union[PromptType, Sequence[PromptType]],
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]]):
    # Grab the lora config off the vllm config on the engine,
    # since this is the same for both v0 & v1.
    lora_config = self.llm_engine.vllm_config.lora_config

    # If there's no lora config / default_mm_loras, or the model
    # isn't multimodal, leave the lora as is.
    if (lora_config is None
            or not self.llm_engine.model_config.is_multimodal_model
            or (lora_config and lora_config.default_mm_loras is None)):
        return lora_request

    if not isinstance(prompts, Sequence):
        prompts = [prompts]

    optional_loras = ([lora_request] * len(prompts)
                      if not isinstance(lora_request, Sequence) else
                      lora_request)

    return [
        self._resolve_single_prompt_mm_lora(
            prompt,
            opt_lora_req,
            lora_config.default_mm_loras,
        ) for prompt, opt_lora_req in zip(prompts, optional_loras)
    ]

_resolve_single_prompt_mm_lora

_resolve_single_prompt_mm_lora(
    prompt: PromptType,
    lora_request: Optional[LoRARequest],
    default_mm_loras: Optional[dict[str, str]],
)
Source code in vllm/entrypoints/llm.py
def _resolve_single_prompt_mm_lora(self, prompt: PromptType,
                                   lora_request: Optional[LoRARequest],
                                   default_mm_loras: Optional[dict[str,
                                                                   str]]):
    if (not default_mm_loras or not isinstance(prompt, dict)
            or "multi_modal_data" not in prompt):
        return lora_request

    prompt = cast(Union[TextPrompt, TokensPrompt], prompt)

    intersection = set(prompt["multi_modal_data"].keys()) \
        .intersection(default_mm_loras.keys())
    if not intersection:
        return lora_request
    if len(intersection) > 1:
        # TODO: Would be nice to be able to have multiple loras per prompt
        logger.warning(
            "Multiple modality-specific LoRAs were registered and would be "
            "used by a single prompt consuming several modalities; "
            "currently we only support one LoRA per request, so the LoRA(s) "
            "registered with modalities %s will be skipped.", intersection)
        return lora_request

    # Build the LoRA request; the ID of the default mm lora is the
    # index of the modality name sorted alphabetically + 1.
    modality_name = intersection.pop()
    modality_lora_path = default_mm_loras[modality_name]
    modality_lora_id = sorted(default_mm_loras).index(modality_name) + 1

    # If there is an ID collision, warn, but always send the explicitly
    # provided request.
    if lora_request:
        if lora_request.lora_int_id != modality_lora_id:
            logger.warning(
                "A modality with a registered lora and a lora_request "
                "with a different ID were provided; falling back to the "
                "lora_request as we only apply one LoRARequest per prompt")
        return lora_request

    return LoRARequest(
        modality_name,
        modality_lora_id,
        modality_lora_path,
    )
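
To make the ID rule above concrete, here is a minimal sketch of how the default multimodal LoRA ID is derived from the sorted modality names; the default_mm_loras mapping and paths below are hypothetical values (standing in for lora_config.default_mm_loras), not part of vLLM.

# Hypothetical modality -> LoRA path mapping (like lora_config.default_mm_loras).
default_mm_loras = {"image": "/loras/image_lora", "audio": "/loras/audio_lora"}

# Sorted modality names are ["audio", "image"], so "audio" -> ID 1, "image" -> ID 2.
modality_lora_id = sorted(default_mm_loras).index("image") + 1
assert modality_lora_id == 2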

_run_engine

_run_engine(
    *, use_tqdm: Union[bool, Callable[..., tqdm]] = True
) -> list[Union[RequestOutput, PoolingRequestOutput]]
Source code in vllm/entrypoints/llm.py
def _run_engine(
    self,
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True
) -> list[Union[RequestOutput, PoolingRequestOutput]]:
    # Initialize tqdm.
    if use_tqdm:
        num_requests = self.llm_engine.get_num_unfinished_requests()
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        pbar = tqdm_func(
            total=num_requests,
            desc="Processed prompts",
            dynamic_ncols=True,
            postfix=(f"est. speed input: {0:.2f} toks/s, "
                     f"output: {0:.2f} toks/s"),
        )

    # Run the engine.
    outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
    total_in_toks = 0
    total_out_toks = 0
    while self.llm_engine.has_unfinished_requests():
        step_outputs = self.llm_engine.step()
        for output in step_outputs:
            if output.finished:
                outputs.append(output)
                if use_tqdm:
                    if isinstance(output, RequestOutput):
                        # Calculate tokens only for RequestOutput
                        n = len(output.outputs)
                        assert output.prompt_token_ids is not None
                        total_in_toks += len(output.prompt_token_ids) * n
                        in_spd = total_in_toks / pbar.format_dict["elapsed"]
                        total_out_toks += sum(
                            len(stp.token_ids) for stp in output.outputs)
                        out_spd = (total_out_toks /
                                   pbar.format_dict["elapsed"])
                        pbar.postfix = (
                            f"est. speed input: {in_spd:.2f} toks/s, "
                            f"output: {out_spd:.2f} toks/s")
                        pbar.update(n)
                    else:
                        pbar.update(1)
                    if pbar.n == num_requests:
                        pbar.refresh()

    if use_tqdm:
        pbar.close()
    # Sort the outputs by request ID.
    # This is necessary because some requests may finish earlier than
    # requests that were submitted before them.
    return sorted(outputs, key=lambda x: int(x.request_id))

_validate_and_add_requests

_validate_and_add_requests(
    prompts: Union[PromptType, Sequence[PromptType]],
    params: Union[
        SamplingParams,
        Sequence[SamplingParams],
        PoolingParams,
        Sequence[PoolingParams],
    ],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[
        Union[Sequence[LoRARequest], LoRARequest]
    ],
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    priority: Optional[list[int]] = None,
) -> None
Source code in vllm/entrypoints/llm.py
def _validate_and_add_requests(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                  Sequence[PoolingParams]],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    priority: Optional[list[int]] = None,
) -> None:
    if isinstance(prompts, (str, dict)):
        # Convert a single prompt to a list.
        prompts = [prompts]

    num_requests = len(prompts)
    if isinstance(params, Sequence) and len(params) != num_requests:
        raise ValueError("The lengths of prompts and params "
                         "must be the same.")
    if isinstance(lora_request,
                  Sequence) and len(lora_request) != num_requests:
        raise ValueError("The lengths of prompts and lora_request "
                         "must be the same.")

    for sp in params if isinstance(params, Sequence) else (params, ):
        if isinstance(sp, SamplingParams):
            # We only care about the final output
            sp.output_kind = RequestOutputKind.FINAL_ONLY

    # Add requests to the engine.
    it = prompts
    if use_tqdm:
        tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
        it = tqdm_func(it, desc="Adding requests")

    for i, prompt in enumerate(it):
        self._add_request(
            prompt,
            params[i] if isinstance(params, Sequence) else params,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request[i] if isinstance(
                lora_request, Sequence) else lora_request,
            priority=priority[i] if priority else 0,
        )

apply_model

apply_model(func: Callable[[Module], _R]) -> list[_R]

Run a function directly on the model inside each worker, returning the result for each of them.

Source code in vllm/entrypoints/llm.py
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
    """
    Run a function directly on the model inside each worker,
    returning the result for each of them.
    """
    executor = self.llm_engine.model_executor
    return executor.apply_model(func)
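
A minimal usage sketch for apply_model; the model name and helper function below are illustrative assumptions, not part of vLLM. The callable receives the in-worker nn.Module and its return value is collected once per worker.

from vllm import LLM

def get_model_class_name(model):
    # Runs inside each worker on the loaded torch.nn.Module.
    return type(model).__name__

llm = LLM(model="facebook/opt-125m")  # illustrative model choice
print(llm.apply_model(get_model_class_name))  # e.g. ["OPTForCausalLM"]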

beam_search

beam_search(
    prompts: list[Union[TokensPrompt, TextPrompt]],
    params: BeamSearchParams,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
    use_tqdm: bool = False,
) -> list[BeamSearchOutput]

Generate sequences using beam search.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompts | list[Union[TokensPrompt, TextPrompt]] | A list of prompts. Each prompt can be a string or a list of token IDs. | required |
| params | BeamSearchParams | The beam search parameters. | required |
| lora_request | Optional[Union[list[LoRARequest], LoRARequest]] | LoRA request to use for generation, if any. | None |
| use_tqdm | bool | Whether to use tqdm to display the progress bar. | False |

Source code in vllm/entrypoints/llm.py
def beam_search(
    self,
    prompts: list[Union[TokensPrompt, TextPrompt]],
    params: BeamSearchParams,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    use_tqdm: bool = False,
) -> list[BeamSearchOutput]:
    """
    Generate sequences using beam search.

    Args:
        prompts: A list of prompts. Each prompt can be a string or a list
            of token IDs.
        params: The beam search parameters.
        lora_request: LoRA request to use for generation, if any.
        use_tqdm: Whether to use tqdm to display the progress bar.
    """
    # TODO: how does beam search work together with length penalty,
    # frequency penalty, and stopping criteria, etc.?
    beam_width = params.beam_width
    max_tokens = params.max_tokens
    temperature = params.temperature
    ignore_eos = params.ignore_eos
    length_penalty = params.length_penalty

    lora_requests = self._get_beam_search_lora_requests(
        lora_request, prompts)

    tokenizer = self.get_tokenizer()
    sort_beams_key = create_sort_beams_key_function(
        tokenizer.eos_token_id,
        length_penalty,
    )

    def create_tokens_prompt_from_beam(
            beam: BeamSearchSequence) -> TokensPrompt:
        token_prompt_kwargs: TokensPrompt = {
            "prompt_token_ids": beam.tokens
        }
        if beam.multi_modal_data is not None:
            token_prompt_kwargs["multi_modal_data"] = beam.multi_modal_data

        if beam.mm_processor_kwargs is not None:
            token_prompt_kwargs[
                "mm_processor_kwargs"] = beam.mm_processor_kwargs
        return TokensPrompt(**token_prompt_kwargs)

    # generate 2 * beam_width candidates at each step
    # following the huggingface transformers implementation
    # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
    beam_search_params = SamplingParams(logprobs=2 * beam_width,
                                        max_tokens=1,
                                        temperature=temperature)
    instances: list[BeamSearchInstance] = []

    for lora_req, prompt in zip(lora_requests, prompts):
        # Add multimodal processor kwargs & data
        mm_kwargs = {}
        if "multi_modal_data" in prompt:
            mm_kwargs["multi_modal_data"] = prompt["multi_modal_data"]
        if "mm_processor_kwargs" in prompt:
            mm_kwargs["mm_processor_kwargs"] = prompt[
                "mm_processor_kwargs"]

        if "prompt_token_ids" in prompt:
            prompt = cast(TokensPrompt, prompt)  # Needed for mypy
            prompt_tokens = prompt["prompt_token_ids"]
        else:
            prompt_tokens = tokenizer.encode(prompt["prompt"])

        instances.append(
            BeamSearchInstance(
                prompt_tokens,
                lora_request=lora_req,
                logprobs=None,
                **mm_kwargs,
            ), )

    token_iter = range(max_tokens)
    if use_tqdm:
        token_iter = tqdm(token_iter,
                          desc="Beam search",
                          unit="token",
                          unit_scale=False)
        logger.warning(
            "The progress bar shows the upper bound on token steps and "
            "may finish early due to stopping conditions. It does not "
            "reflect instance-level progress.")

    for _ in token_iter:
        all_beams: list[BeamSearchSequence] = list(
            sum((instance.beams for instance in instances), []))
        pos = [0] + list(
            itertools.accumulate(
                len(instance.beams) for instance in instances))
        instance_start_and_end: list[tuple[int, int]] = list(
            zip(pos[:-1], pos[1:]))

        if len(all_beams) == 0:
            break

        # create the corresponding batch entries for prompt & optional lora
        prompts_batch, lora_req_batch = zip(
            *[(create_tokens_prompt_from_beam(beam), beam.lora_request)
              for beam in all_beams])

        # only runs for one step
        # we don't need to use tqdm here
        output = self.generate(prompts_batch,
                               sampling_params=beam_search_params,
                               use_tqdm=False,
                               lora_request=lora_req_batch)

        for (start, end), instance in zip(instance_start_and_end,
                                          instances):
            instance_new_beams = []
            for i in range(start, end):
                current_beam = all_beams[i]
                result = output[i]

                if result.outputs[0].logprobs is not None:
                    # If `result.outputs[0].logprobs` is None, the sequence
                    # finished because of the max-model-len limit or was
                    # aborted, so we don't add it to the new beams.
                    logprobs = result.outputs[0].logprobs[0]
                    for token_id, logprob_obj in logprobs.items():
                        new_beam = BeamSearchSequence(
                            tokens=current_beam.tokens + [token_id],
                            logprobs=current_beam.logprobs + [logprobs],
                            lora_request=current_beam.lora_request,
                            cum_logprob=current_beam.cum_logprob +
                            logprob_obj.logprob,
                            multi_modal_data=current_beam.multi_modal_data,
                            mm_processor_kwargs=current_beam.
                            mm_processor_kwargs)

                        if token_id == tokenizer.eos_token_id and \
                            not ignore_eos:
                            instance.completed.append(new_beam)
                        else:
                            instance_new_beams.append(new_beam)
            sorted_beams = sorted(instance_new_beams,
                                  key=sort_beams_key,
                                  reverse=True)
            instance.beams = sorted_beams[:beam_width]

    outputs = []
    for instance in instances:
        instance.completed.extend(instance.beams)
        sorted_completed = sorted(instance.completed,
                                  key=sort_beams_key,
                                  reverse=True)
        best_beams = sorted_completed[:beam_width]

        for beam in best_beams:
            beam.text = tokenizer.decode(beam.tokens)
        outputs.append(BeamSearchOutput(sequences=best_beams))

    return outputs
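
An illustrative beam_search call; the model name and parameter values are assumptions. Prompts are passed as TextPrompt/TokensPrompt dicts, and each returned BeamSearchOutput holds up to beam_width decoded sequences.

from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="facebook/opt-125m")  # illustrative model choice
params = BeamSearchParams(beam_width=4, max_tokens=32)

outputs = llm.beam_search([{"prompt": "The capital of France is"}], params)
for seq in outputs[0].sequences:
    # Each beam carries its cumulative logprob and decoded text.
    print(seq.cum_logprob, seq.text)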

chat

chat(
    messages: Union[
        list[ChatCompletionMessageParam],
        list[list[ChatCompletionMessageParam]],
    ],
    sampling_params: Optional[
        Union[SamplingParams, list[SamplingParams]]
    ] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[LoRARequest] = None,
    chat_template: Optional[str] = None,
    chat_template_content_format: ChatTemplateContentFormatOption = "auto",
    add_generation_prompt: bool = True,
    continue_final_message: bool = False,
    tools: Optional[list[dict[str, Any]]] = None,
    chat_template_kwargs: Optional[dict[str, Any]] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> list[RequestOutput]

Generate responses for a chat conversation.

The chat conversation is converted into a text prompt using the tokenizer, and the generate method is called to produce the responses.

Multi-modal inputs can be passed in the same way you would pass them to the OpenAI API.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| messages | Union[list[ChatCompletionMessageParam], list[list[ChatCompletionMessageParam]]] | A list of conversations or a single conversation. Each conversation is represented as a list of messages, and each message is a dictionary with 'role' and 'content' keys. | required |
| sampling_params | Optional[Union[SamplingParams, list[SamplingParams]]] | The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it is a list, the list must have the same length as the prompts and it is paired one by one with the prompt. | None |
| use_tqdm | Union[bool, Callable[..., tqdm]] | If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created. | True |
| lora_request | Optional[LoRARequest] | LoRA request to use for generation, if any. | None |
| chat_template | Optional[str] | The template to use for structuring the chat. If not provided, the model's default chat template will be used. | None |
| chat_template_content_format | ChatTemplateContentFormatOption | The format to render message content. "string" will render the content as a string, e.g. "Who are you?"; "openai" will render the content as a list of dictionaries, similar to OpenAI schema, e.g. [{"type": "text", "text": "Who are you?"}]. | 'auto' |
| add_generation_prompt | bool | If True, adds a generation template to each message. | True |
| continue_final_message | bool | If True, continues the final message in the conversation instead of starting a new one. Cannot be True if add_generation_prompt is also True. | False |
| chat_template_kwargs | Optional[dict[str, Any]] | Additional kwargs to pass to the chat template. | None |
| mm_processor_kwargs | Optional[dict[str, Any]] | Multimodal processor kwarg overrides for this chat request. Only used for offline requests. | None |

Returns:

| Type | Description |
| --- | --- |
| list[RequestOutput] | A list of RequestOutput objects containing the generated responses in the same order as the input messages. |

Source code in vllm/entrypoints/llm.py
def chat(
    self,
    messages: Union[list[ChatCompletionMessageParam],
                    list[list[ChatCompletionMessageParam]]],
    sampling_params: Optional[Union[SamplingParams,
                                    list[SamplingParams]]] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[LoRARequest] = None,
    chat_template: Optional[str] = None,
    chat_template_content_format: ChatTemplateContentFormatOption = "auto",
    add_generation_prompt: bool = True,
    continue_final_message: bool = False,
    tools: Optional[list[dict[str, Any]]] = None,
    chat_template_kwargs: Optional[dict[str, Any]] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
) -> list[RequestOutput]:
    """
    Generate responses for a chat conversation.

    The chat conversation is converted into a text prompt using the
    tokenizer, and the [generate][] method is called to generate the
    responses.

    Multi-modal inputs can be passed in the same way you would pass them
    to the OpenAI API.

    Args:
        messages: A list of conversations or a single conversation.

            - Each conversation is represented as a list of messages.
            - Each message is a dictionary with 'role' and 'content' keys.

        sampling_params: The sampling parameters for text generation.
            If None, we use the default sampling parameters. When it
            is a single value, it is applied to every prompt. When it
            is a list, the list must have the same length as the
            prompts and it is paired one by one with the prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        chat_template: The template to use for structuring the chat.
            If not provided, the model's default chat template will be used.
        chat_template_content_format: The format to render message content.

            - "string" will render the content as a string.
              Example: `"Who are you?"`
            - "openai" will render the content as a list of dictionaries,
              similar to OpenAI schema.
              Example: `[{"type": "text", "text": "Who are you?"}]`

        add_generation_prompt: If True, adds a generation template
            to each message.
        continue_final_message: If True, continues the final message in
            the conversation instead of starting a new one. Cannot be
            `True` if `add_generation_prompt` is also `True`.
        chat_template_kwargs: Additional kwargs to pass to the chat
            template.
        mm_processor_kwargs: Multimodal processor kwarg overrides for this
            chat request. Only used for offline requests.

    Returns:
        A list of `RequestOutput` objects containing the generated
        responses in the same order as the input messages.
    """
    list_of_messages: list[list[ChatCompletionMessageParam]]

    # Handle multi and single conversations
    if is_list_of(messages, list):
        # messages is list[list[...]]
        list_of_messages = cast(list[list[ChatCompletionMessageParam]],
                                messages)
    else:
        # messages is list[...]
        list_of_messages = [
            cast(list[ChatCompletionMessageParam], messages)
        ]

    tokenizer = self.get_tokenizer(lora_request)
    model_config = self.llm_engine.get_model_config()
    resolved_content_format = resolve_chat_template_content_format(
        chat_template,
        tools,
        chat_template_content_format,
        tokenizer,
        model_config=model_config,
    )

    _chat_template_kwargs: dict[str, Any] = dict(
        chat_template=chat_template,
        add_generation_prompt=add_generation_prompt,
        continue_final_message=continue_final_message,
        tools=tools,
    )
    _chat_template_kwargs.update(chat_template_kwargs or {})

    prompts: list[Union[TokensPrompt, TextPrompt]] = []

    for msgs in list_of_messages:
        # NOTE: _parse_chat_message_content_parts() currently doesn't
        # handle mm_processor_kwargs, since there is no implementation in
        # the chat message parsing for it.
        conversation, mm_data = parse_chat_messages(
            msgs,
            model_config,
            tokenizer,
            content_format=resolved_content_format,
        )

        if isinstance(tokenizer, MistralTokenizer):
            prompt_token_ids = apply_mistral_chat_template(
                tokenizer,
                messages=msgs,
                **_chat_template_kwargs,
            )
        else:
            prompt_str = apply_hf_chat_template(
                tokenizer=tokenizer,
                conversation=conversation,
                model_config=model_config,
                **_chat_template_kwargs,
            )
            # Special tokens are already included in chat templates so
            # should not be added by the tokenizer in this case.
            prompt_token_ids = tokenizer.encode(prompt_str,
                                                add_special_tokens=False)

        prompt = TokensPrompt(prompt_token_ids=prompt_token_ids)

        if mm_data is not None:
            prompt["multi_modal_data"] = mm_data

        if mm_processor_kwargs is not None:
            prompt["mm_processor_kwargs"] = mm_processor_kwargs

        prompts.append(prompt)

    return self.generate(
        prompts,
        sampling_params=sampling_params,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
    )
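
A minimal chat sketch, assuming a chat-tuned model (the model name and sampling values are illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # illustrative chat model
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about GPUs."},
]

outputs = llm.chat(conversation, SamplingParams(temperature=0.7, max_tokens=64))
print(outputs[0].outputs[0].text)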

classify

classify(
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ClassificationRequestOutput]

Generate class logits for each prompt.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompts | Union[PromptType, Sequence[PromptType]] | The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt. | required |
| use_tqdm | Union[bool, Callable[..., tqdm]] | If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created. | True |
| lora_request | Optional[Union[list[LoRARequest], LoRARequest]] | LoRA request to use for generation, if any. | None |
| pooling_params | Optional[Union[PoolingParams, Sequence[PoolingParams]]] | The pooling parameters for pooling. If None, we use the default pooling parameters. | None |

Returns: A list of ClassificationRequestOutput objects containing the class probabilities in the same order as the input prompts.

Source code in vllm/entrypoints/llm.py
def classify(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ClassificationRequestOutput]:
    """
    Generate class logits for each prompt.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
    Returns:
        A list of `ClassificationRequestOutput` objects containing the
        class probabilities in the same order as the input prompts.
    """
    if "classify" not in self.supported_tasks:
        raise ValueError(
            "Classification API is not supported by this model. "
            "Try converting the model using `--convert classify`.")

    items = self.encode(
        prompts,
        use_tqdm=use_tqdm,
        pooling_params=pooling_params,
        lora_request=lora_request,
        pooling_task="classify",
    )

    return [ClassificationRequestOutput.from_base(item) for item in items]
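
Illustrative classify usage, assuming a sequence-classification model is loaded (the model name is an assumption):

from vllm import LLM

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach")  # illustrative classifier
(output,) = llm.classify(["vLLM is a high-throughput inference engine."])
print(output.outputs.probs)  # per-class probabilities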

collective_rpc

collective_rpc(
    method: Union[str, Callable[..., _R]],
    timeout: Optional[float] = None,
    args: tuple = (),
    kwargs: Optional[dict[str, Any]] = None,
) -> list[_R]

Execute an RPC call on all workers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| method | Union[str, Callable[..., _R]] | Name of the worker method to execute, or a callable that is serialized and sent to all workers to execute. If the method is a callable, it should accept an additional self argument, in addition to the arguments passed in args and kwargs. The self argument will be the worker object. | required |
| timeout | Optional[float] | Maximum time in seconds to wait for execution. Raises a TimeoutError on timeout. None means wait indefinitely. | None |
| args | tuple | Positional arguments to pass to the worker method. | () |
| kwargs | Optional[dict[str, Any]] | Keyword arguments to pass to the worker method. | None |

Returns:

| Type | Description |
| --- | --- |
| list[_R] | A list containing the results from each worker. |

Note

It is recommended to use this API to only pass control messages, and set up data-plane communication to pass data.

Source code in vllm/entrypoints/llm.py
def collective_rpc(self,
                   method: Union[str, Callable[..., _R]],
                   timeout: Optional[float] = None,
                   args: tuple = (),
                   kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
    """
    Execute an RPC call on all workers.

    Args:
        method: Name of the worker method to execute, or a callable that
            is serialized and sent to all workers to execute.

            If the method is a callable, it should accept an additional
            `self` argument, in addition to the arguments passed in `args`
            and `kwargs`. The `self` argument will be the worker object.
        timeout: Maximum time in seconds to wait for execution. Raises a
            [`TimeoutError`][] on timeout. `None` means wait indefinitely.
        args: Positional arguments to pass to the worker method.
        kwargs: Keyword arguments to pass to the worker method.

    Returns:
        A list containing the results from each worker.

    Note:
        It is recommended to use this API to only pass control messages,
        and set up data-plane communication to pass data.
    """

    return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
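
A small collective_rpc sketch; the helper below is hypothetical and simply reports each worker's current CUDA device, assuming a CUDA deployment with two tensor-parallel workers.

import torch
from vllm import LLM

def current_device(worker):
    # `worker` is the per-process worker object (the `self` argument).
    return torch.cuda.current_device()

llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2)  # illustrative
print(llm.collective_rpc(current_device))  # e.g. [0, 1]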

embed

embed(
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[EmbeddingRequestOutput]

Generate an embedding vector for each prompt.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompts | Union[PromptType, Sequence[PromptType]] | The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt. | required |
| pooling_params | Optional[Union[PoolingParams, Sequence[PoolingParams]]] | The pooling parameters for pooling. If None, we use the default pooling parameters. | None |
| use_tqdm | Union[bool, Callable[..., tqdm]] | If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created. | True |
| lora_request | Optional[Union[list[LoRARequest], LoRARequest]] | LoRA request to use for generation, if any. | None |

Returns:

| Type | Description |
| --- | --- |
| list[EmbeddingRequestOutput] | A list of EmbeddingRequestOutput objects containing the embedding vectors in the same order as the input prompts. |

Source code in vllm/entrypoints/llm.py
def embed(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[EmbeddingRequestOutput]:
    """
    Generate an embedding vector for each prompt.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.

    Returns:
        A list of `EmbeddingRequestOutput` objects containing the
        embedding vectors in the same order as the input prompts.
    """
    if "embed" not in self.supported_tasks:
        raise ValueError(
            "Embedding API is not supported by this model. "
            "Try converting the model using `--convert embed`.")

    items = self.encode(
        prompts,
        truncate_prompt_tokens=truncate_prompt_tokens,
        use_tqdm=use_tqdm,
        pooling_params=pooling_params,
        lora_request=lora_request,
        pooling_task="embed",
    )

    return [EmbeddingRequestOutput.from_base(item) for item in items]
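
Illustrative embed usage, assuming an embedding model that resolves to the pooling runner (the model name is an assumption):

from vllm import LLM

llm = LLM(model="intfloat/e5-small-v2")  # illustrative embedding model
outputs = llm.embed(["Hello, world!", "vLLM is fast."])
for out in outputs:
    print(len(out.outputs.embedding))  # embedding dimensionality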

encode

encode(
    prompts: Union[PromptType, Sequence[PromptType]],
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
    pooling_task: PoolingTask = "encode",
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[PoolingRequestOutput]

Apply pooling to the hidden states corresponding to the input prompts.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompts | Union[PromptType, Sequence[PromptType]] | The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt. | required |
| pooling_params | Optional[Union[PoolingParams, Sequence[PoolingParams]]] | The pooling parameters for pooling. If None, we use the default pooling parameters. | None |
| use_tqdm | Union[bool, Callable[..., tqdm]] | If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created. | True |
| lora_request | Optional[Union[list[LoRARequest], LoRARequest]] | LoRA request to use for generation, if any. | None |
| pooling_task | PoolingTask | Override the pooling task to use. | 'encode' |

Returns:

| Type | Description |
| --- | --- |
| list[PoolingRequestOutput] | A list of PoolingRequestOutput objects containing the pooled hidden states in the same order as the input prompts. |

Note

Using prompts and prompt_token_ids as keyword parameters is considered legacy and may be deprecated in the future. You should instead pass them via the inputs parameter.

Source code in vllm/entrypoints/llm.py
def encode(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    pooling_task: PoolingTask = "encode",
    tokenization_kwargs: Optional[dict[str, Any]] = None,
) -> list[PoolingRequestOutput]:
    """Apply pooling to the hidden states corresponding to the input
    prompts.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_task: Override the pooling task to use.

    Returns:
        A list of `PoolingRequestOutput` objects containing the
        pooled hidden states in the same order as the input prompts.

    Note:
        Using `prompts` and `prompt_token_ids` as keyword parameters is
        considered legacy and may be deprecated in the future. You should
        instead pass them via the `inputs` parameter.
    """
    if pooling_task is None:
        if "embed" in self.supported_tasks:
            pooling_task = "embed"
        else:
            pooling_task = "encode"

        logger.warning_once(
            "`LLM.encode` is currently using `pooling_task = %s`.\n"
            "Please use one of the more specific methods or set the "
            "task directly when using `LLM.encode`:\n"
            "  - For embeddings, use `LLM.embed(...)` "
            "or `pooling_task=\"embed\"`.\n"
            "  - For classification logits, use `LLM.classify(...)` "
            "or `pooling_task=\"classify\"`.\n"
            "  - For rewards, use `LLM.reward(...)` "
            "or `pooling_task=\"reward\"`\n"
            "  - For similarity scores, use `LLM.score(...)`.",
            pooling_task)

    model_config = self.llm_engine.model_config
    runner_type = model_config.runner_type
    if runner_type != "pooling":
        raise ValueError(
            "LLM.encode() is only supported for pooling models. "
            "Try passing `--runner pooling` to use the model as a "
            "pooling model.")

    if pooling_task not in self.supported_tasks:
        raise ValueError(
            f"pooling_task must be one of {self.supported_tasks}.")

    if pooling_params is None:
        # Use default pooling params.
        pooling_params = PoolingParams()

    if isinstance(pooling_params, PoolingParams):
        pooling_params.verify(pooling_task, model_config)
    else:
        for pooling_param in pooling_params:
            pooling_param.verify(pooling_task, model_config)

    if tokenization_kwargs is None:
        tokenization_kwargs = dict[str, Any]()
        _validate_truncation_size(model_config.max_model_len,
                                  truncate_prompt_tokens,
                                  tokenization_kwargs)

    self._validate_and_add_requests(
        prompts=prompts,
        params=pooling_params,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        tokenization_kwargs=tokenization_kwargs,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)
    return self.engine_class.validate_outputs(outputs,
                                              PoolingRequestOutput)
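
An illustrative encode call that sets pooling_task explicitly, as the warning above recommends; the model name is an assumption.

from vllm import LLM

llm = LLM(model="intfloat/e5-small-v2")  # illustrative pooling model
outputs = llm.encode(["Hello, world!"], pooling_task="embed")
print(outputs[0].outputs.data.shape)  # pooled hidden states for the prompt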

generate

generate(
    prompts: Union[PromptType, Sequence[PromptType]],
    sampling_params: Optional[
        Union[SamplingParams, Sequence[SamplingParams]]
    ] = None,
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
    priority: Optional[list[int]] = None,
) -> list[RequestOutput]

Generates the completions for the input prompts.

This class automatically batches the given prompts, considering the memory constraint. For the best performance, put all of your prompts into a single list and pass it to this method.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompts | Union[PromptType, Sequence[PromptType]] | The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt. | required |
| sampling_params | Optional[Union[SamplingParams, Sequence[SamplingParams]]] | The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it is a list, the list must have the same length as the prompts and it is paired one by one with the prompt. | None |
| use_tqdm | Union[bool, Callable[..., tqdm]] | If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created. | True |
| lora_request | Optional[Union[list[LoRARequest], LoRARequest]] | LoRA request to use for generation, if any. | None |
| priority | Optional[list[int]] | The priority of the requests, if any. Only applicable when priority scheduling policy is enabled. | None |

Returns:

| Type | Description |
| --- | --- |
| list[RequestOutput] | A list of RequestOutput objects containing the generated completions in the same order as the input prompts. |

Note

Using prompts and prompt_token_ids as keyword parameters is considered legacy and may be deprecated in the future. You should instead pass them via the inputs parameter.

Source code in vllm/entrypoints/llm.py
def generate(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    sampling_params: Optional[Union[SamplingParams,
                                    Sequence[SamplingParams]]] = None,
    *,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    priority: Optional[list[int]] = None,
) -> list[RequestOutput]:
    """Generates the completions for the input prompts.

    This class automatically batches the given prompts, considering
    the memory constraint. For the best performance, put all of your prompts
    into a single list and pass it to this method.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        sampling_params: The sampling parameters for text generation. If
            None, we use the default sampling parameters.
            When it is a single value, it is applied to every prompt.
            When it is a list, the list must have the same length as the
            prompts and it is paired one by one with the prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        priority: The priority of the requests, if any.
            Only applicable when priority scheduling policy is enabled.

    Returns:
        A list of `RequestOutput` objects containing the
        generated completions in the same order as the input prompts.

    Note:
        Using `prompts` and `prompt_token_ids` as keyword parameters is
        considered legacy and may be deprecated in the future. You should
        instead pass them via the `inputs` parameter.
    """
    model_config = self.llm_engine.model_config
    runner_type = model_config.runner_type
    if runner_type != "generate":
        raise ValueError(
            "LLM.generate() is only supported for generative models. "
            "Try passing `--runner generate` to use the model as a "
            "generative model.")

    if sampling_params is None:
        # Use default sampling params.
        sampling_params = self.get_default_sampling_params()

    tokenization_kwargs: dict[str, Any] = {}
    truncate_prompt_tokens = None
    if isinstance(sampling_params, SamplingParams):
        truncate_prompt_tokens = sampling_params.truncate_prompt_tokens

    _validate_truncation_size(model_config.max_model_len,
                              truncate_prompt_tokens, tokenization_kwargs)

    # Add any modality specific loras to the corresponding prompts
    lora_request = self._get_modality_specific_lora_reqs(
        prompts, lora_request)

    self._validate_and_add_requests(
        prompts=prompts,
        params=sampling_params,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        tokenization_kwargs=tokenization_kwargs,
        priority=priority,
    )

    outputs = self._run_engine(use_tqdm=use_tqdm)
    return self.engine_class.validate_outputs(outputs, RequestOutput)
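
A minimal generate sketch; the model name and sampling values are illustrative.

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # illustrative model choice
sampling = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

outputs = llm.generate(["Hello, my name is", "The future of AI is"], sampling)
for out in outputs:
    print(out.prompt, "->", out.outputs[0].text)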

get_default_sampling_params

get_default_sampling_params() -> SamplingParams
Source code in vllm/entrypoints/llm.py
def get_default_sampling_params(self) -> SamplingParams:
    if self.default_sampling_params is None:
        self.default_sampling_params = (
            self.llm_engine.model_config.get_diff_sampling_param())
    if self.default_sampling_params:
        return SamplingParams.from_optional(**self.default_sampling_params)
    return SamplingParams()

get_metrics

get_metrics() -> list[Metric]

Return a snapshot of aggregated metrics from Prometheus.

Returns:

| Type | Description |
| --- | --- |
| list[Metric] | A MetricSnapshot instance capturing the current state of all aggregated metrics from Prometheus. |

Note

This method is only available with the V1 LLM engine.

Source code in vllm/entrypoints/llm.py
def get_metrics(self) -> list["Metric"]:
    """Return a snapshot of aggregated metrics from Prometheus.

    Returns:
        A ``MetricSnapshot`` instance capturing the current state
        of all aggregated metrics from Prometheus.

    Note:
        This method is only available with the V1 LLM engine.
    """
    from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
    assert isinstance(self.llm_engine, V1LLMEngine)
    return self.llm_engine.get_metrics()
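
An illustrative get_metrics call; this assumes the V1 engine is in use and that stat logging is enabled (the model name is an assumption).

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", disable_log_stats=False)  # illustrative
llm.generate(["Hello"], SamplingParams(max_tokens=8))

for metric in llm.get_metrics():
    print(metric.name)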

get_tokenizer

get_tokenizer(
    lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer
Source code in vllm/entrypoints/llm.py
def get_tokenizer(
    self,
    lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer:
    return self.llm_engine.get_tokenizer_group().get_lora_tokenizer(
        lora_request)

reset_prefix_cache

reset_prefix_cache(device: Optional[Device] = None) -> bool
Source code in vllm/entrypoints/llm.py
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
    return self.llm_engine.reset_prefix_cache(device)

reward

reward(
    prompts: Union[PromptType, Sequence[PromptType]],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[
        Union[PoolingParams, Sequence[PoolingParams]]
    ] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[PoolingRequestOutput]

Generate rewards for each prompt.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prompts | Union[PromptType, Sequence[PromptType]] | The prompts to the LLM. You may pass a sequence of prompts for batch inference. See PromptType for more details about the format of each prompt. | required |
| use_tqdm | Union[bool, Callable[..., tqdm]] | If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created. | True |
| lora_request | Optional[Union[list[LoRARequest], LoRARequest]] | LoRA request to use for generation, if any. | None |
| pooling_params | Optional[Union[PoolingParams, Sequence[PoolingParams]]] | The pooling parameters for pooling. If None, we use the default pooling parameters. | None |

Returns: A list of PoolingRequestOutput objects containing the pooled hidden states in the same order as the input prompts.

Source code in vllm/entrypoints/llm.py
def reward(
    self,
    prompts: Union[PromptType, Sequence[PromptType]],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[Union[PoolingParams,
                                   Sequence[PoolingParams]]] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[PoolingRequestOutput]:
    """
    Generate rewards for each prompt.

    Args:
        prompts: The prompts to the LLM. You may pass a sequence of prompts
            for batch inference. See [PromptType][vllm.inputs.PromptType]
            for more details about the format of each prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
    Returns:
        A list of `PoolingRequestOutput` objects containing the
        pooled hidden states in the same order as the input prompts.
    """

    return self.encode(
        prompts,
        use_tqdm=use_tqdm,
        lora_request=lora_request,
        pooling_params=pooling_params,
        truncate_prompt_tokens=truncate_prompt_tokens,
        pooling_task="encode",
    )
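
Illustrative reward usage, assuming a reward model that resolves to the pooling runner (the model name is an assumption):

from vllm import LLM

llm = LLM(model="internlm/internlm2-1_8b-reward", trust_remote_code=True)  # illustrative
(output,) = llm.reward(["The assistant's answer was helpful and correct."])
print(output.outputs.data)  # pooled reward output for the prompt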

score

score(
    data_1: Union[
        SingletonPrompt,
        Sequence[SingletonPrompt],
        ScoreMultiModalParam,
    ],
    data_2: Union[
        SingletonPrompt,
        Sequence[SingletonPrompt],
        ScoreMultiModalParam,
    ],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[
        Union[list[LoRARequest], LoRARequest]
    ] = None,
) -> list[ScoringRequestOutput]

Generate similarity scores for all pairs <text,text_pair> or <multi-modal data, multi-modal data pair>.

The inputs can be 1 -> 1, 1 -> N or N -> N. In the 1 -> N case the data_1 input will be replicated N times to pair with the data_2 inputs. The input pairs are used to build a list of prompts for the cross encoder model. This class automatically batches the prompts, considering the memory constraint. For the best performance, put all of your inputs into a single list and pass it to this method.

Supports both text and multi-modal data (images, etc.) when used with appropriate multi-modal models. For multi-modal inputs, ensure the prompt structure matches the model's expected input format.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data_1 | Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam] | Can be a single prompt, a list of prompts or ScoreMultiModalParam, which can contain either text or multi-modal data. When a list, it must have the same length as the data_2 list. | required |
| data_2 | Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam] | The data to pair with the query to form the input to the LLM. Can be text or multi-modal data. See PromptType for more details about the format of each prompt. | required |
| use_tqdm | Union[bool, Callable[..., tqdm]] | If True, shows a tqdm progress bar. If a callable (e.g., functools.partial(tqdm, leave=False)), it is used to create the progress bar. If False, no progress bar is created. | True |
| lora_request | Optional[Union[list[LoRARequest], LoRARequest]] | LoRA request to use for generation, if any. | None |
| pooling_params | Optional[PoolingParams] | The pooling parameters for pooling. If None, we use the default pooling parameters. | None |

Returns: A list of ScoringRequestOutput objects containing the generated scores in the same order as the input prompts.

Source code in vllm/entrypoints/llm.py
def score(
    self,
    data_1: Union[SingletonPrompt, Sequence[SingletonPrompt],
                  ScoreMultiModalParam],
    data_2: Union[SingletonPrompt, Sequence[SingletonPrompt],
                  ScoreMultiModalParam],
    /,
    *,
    truncate_prompt_tokens: Optional[int] = None,
    use_tqdm: Union[bool, Callable[..., tqdm]] = True,
    pooling_params: Optional[PoolingParams] = None,
    lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
) -> list[ScoringRequestOutput]:
    """Generate similarity scores for all pairs `<text,text_pair>` or
      `<multi-modal data, multi-modal data pair>`.

    The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
    In the `1 - N` case the `data_1` input will be replicated `N`
    times to pair with the `data_2` inputs.
    The input pairs are used to build a list of prompts for the
    cross encoder model. This class automatically batches the prompts,
    considering the memory constraint. For the best performance, put all
    of your inputs into a single list and pass it to this method.

    Supports both text and multi-modal data (images, etc.) when used with
    appropriate multi-modal models. For multi-modal inputs, ensure the
    prompt structure matches the model's expected input format.

    Args:
        data_1: Can be a single prompt, a list of prompts or
            `ScoreMultiModalParam`, which can contain either text or
            multi-modal data. When a list, it must have the same length as
            the `data_2` list.
        data_2: The data to pair with the query to form the input to
            the LLM. Can be text or multi-modal data. See [PromptType]
            [vllm.inputs.PromptType] for more details about the format of
            each prompt.
        use_tqdm: If `True`, shows a tqdm progress bar.
            If a callable (e.g., `functools.partial(tqdm, leave=False)`),
            it is used to create the progress bar.
            If `False`, no progress bar is created.
        lora_request: LoRA request to use for generation, if any.
        pooling_params: The pooling parameters for pooling. If None, we
            use the default pooling parameters.
    Returns:
        A list of `ScoringRequestOutput` objects containing the
        generated scores in the same order as the input prompts.
    """
    model_config = self.llm_engine.model_config
    runner_type = model_config.runner_type
    if runner_type != "pooling":
        raise ValueError(
            "LLM.score() is only supported for pooling models. "
            "Try passing `--runner pooling` to use the model as a "
            "pooling model.")

    supported_tasks = self.supported_tasks
    if all(t not in supported_tasks for t in ("embed", "classify")):
        raise ValueError("Score API is not supported by this model. "
                         "Try converting the model using "
                         "`--convert embed` or `--convert classify`.")

    if (model_config.is_cross_encoder
            and getattr(model_config.hf_config, "num_labels", 0) != 1):
        raise ValueError("Score API is only enabled for num_labels == 1.")

    # the tokenizer for models such as
    # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
    # lists of tokens to the `text` and `text_pair` kwargs
    tokenizer = self.get_tokenizer()

    if not model_config.is_multimodal_model:

        def check_data_type(data: Union[SingletonPrompt,
                                        Sequence[SingletonPrompt],
                                        ScoreMultiModalParam]):
            if isinstance(data, dict) and "content" in data:
                raise ValueError("ScoreMultiModalParam is not supported "
                                 f"for {model_config.architecture}")

        check_data_type(data_1)
        check_data_type(data_2)

        def ensure_str(prompt: SingletonPrompt):
            if isinstance(prompt, dict):
                if "multi_modal_data" in prompt:
                    raise ValueError("Multi-modal prompt is not "
                                     "supported for scoring")
                elif "prompt_token_ids" in prompt:
                    prompt = tokenizer.decode(
                        cast(TokensPrompt, prompt)["prompt_token_ids"])
                elif "prompt" in prompt:
                    prompt = cast(TextPrompt, prompt)["prompt"]
            assert type(prompt) is str
            return prompt

        if isinstance(data_1, (str, dict)):
            # Convert a single prompt to a list.
            data_1 = [data_1]  # type: ignore[list-item]

        data_1 = [ensure_str(t) for t in data_1]

        if isinstance(data_2, (str, dict)):
            # Convert a single prompt to a list.
            data_2 = [data_2]  # type: ignore[list-item]

        data_2 = [ensure_str(t) for t in data_2]

    if isinstance(data_1, dict) and "content" in data_1:
        data_1 = data_1.get("content")  # type: ignore[assignment]
    elif isinstance(data_1, str):
        data_1 = [data_1]

    if isinstance(data_2, dict) and "content" in data_2:
        data_2 = data_2.get("content")  # type: ignore[assignment]
    elif isinstance(data_2, str):
        data_2 = [data_2]

    _validate_score_input_lens(data_1, data_2)  # type: ignore[arg-type]

    if model_config.is_cross_encoder:
        return self._cross_encoding_score(
            tokenizer,
            data_1,  # type: ignore[arg-type]
            data_2,  # type: ignore[arg-type]
            truncate_prompt_tokens,
            use_tqdm,
            pooling_params,
            lora_request)
    else:
        return self._embedding_score(
            tokenizer,
            data_1,  # type: ignore[arg-type]
            data_2,  # type: ignore[arg-type]
            truncate_prompt_tokens,
            use_tqdm,
            pooling_params,
            lora_request)
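
A minimal usage sketch of the `1 -> N` case described above. The model name is taken from the comment in the source (`cross-encoder/ms-marco-MiniLM-L-6-v2`); the exact constructor arguments needed to load it as a pooling model may vary by vLLM version, so treat this as illustrative rather than canonical:

```
from vllm import LLM

# Assumption: this cross-encoder loads as a pooling model; depending on the
# vLLM version an extra argument (e.g. a runner/task setting) may be needed.
llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "What is the capital of France?"
passages = [
    "Paris is the capital and most populous city of France.",
    "The Great Wall of China is thousands of kilometres long.",
]

# data_1 (the query) is replicated to pair with every entry of data_2.
outputs = llm.score(query, passages)
for passage, output in zip(passages, outputs):
    print(f"{output.outputs.score:.4f}  {passage}")
```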

set_tokenizer

set_tokenizer(tokenizer: AnyTokenizer) -> None
Source code in vllm/entrypoints/llm.py
def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
    tokenizer_group = self.llm_engine.get_tokenizer_group()

    # While CachedTokenizer is dynamic, we have no choice but to
    # compare the class name. Misjudgment may arise from a
    # user-defined tokenizer whose name starts with 'Cached'.
    if tokenizer.__class__.__name__.startswith("Cached"):
        tokenizer_group.tokenizer = tokenizer
    else:
        tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)

sleep

sleep(level: int = 1)

Put the engine to sleep. The engine should not process any requests. The caller should guarantee that no requests are being processed during the sleep period, before wake_up is called.

Parameters:

Name Type Description Default
level int

The sleep level. Level 1 sleep will offload the model weights and discard the kv cache. The content of kv cache is forgotten. Level 1 sleep is good for sleeping and waking up the engine to run the same model again. The model weights are backed up in CPU memory. Please make sure there's enough CPU memory to store the model weights. Level 2 sleep will discard both the model weights and the kv cache. The content of both the model weights and kv cache is forgotten. Level 2 sleep is good for sleeping and waking up the engine to run a different model or update the model, where previous model weights are not needed. It reduces CPU memory pressure.

1
Source code in vllm/entrypoints/llm.py
def sleep(self, level: int = 1):
    """
    Put the engine to sleep. The engine should not process any requests.
    The caller should guarantee that no requests are being processed
    during the sleep period, before `wake_up` is called.

    Args:
        level: The sleep level. Level 1 sleep will offload the model
            weights and discard the kv cache. The content of kv cache
            is forgotten. Level 1 sleep is good for sleeping and waking
            up the engine to run the same model again. The model weights
            are backed up in CPU memory. Please make sure there's enough
            CPU memory to store the model weights. Level 2 sleep will
            discard both the model weights and the kv cache. The content
            of both the model weights and kv cache is forgotten. Level 2
            sleep is good for sleeping and waking up the engine to run a
            different model or update the model, where previous model
            weights are not needed. It reduces CPU memory pressure.
    """
    self.reset_prefix_cache()
    self.llm_engine.sleep(level=level)
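
A hedged sketch of the level semantics above. The `enable_sleep_mode` flag and the model name are assumptions not documented on this page:

```
from vllm import LLM, SamplingParams

# Assumption: sleep()/wake_up() require the engine to be created with
# sleep mode enabled; the flag name below may differ across versions.
llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)
llm.generate(["Hello"], SamplingParams(max_tokens=8))

# Level 1: offload weights to CPU, discard the KV cache. Suitable when the
# same model will be woken up and served again.
llm.sleep(level=1)
llm.wake_up()
llm.generate(["Hello again"], SamplingParams(max_tokens=8))
```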

start_profile

start_profile() -> None
Source code in vllm/entrypoints/llm.py
def start_profile(self) -> None:
    self.llm_engine.start_profile()

stop_profile

stop_profile() -> None
Source code in vllm/entrypoints/llm.py
def stop_profile(self) -> None:
    self.llm_engine.stop_profile()

wake_up

wake_up(tags: Optional[list[str]] = None)

Wake up the engine from sleep mode. See the sleep method for more details.

Parameters:

Name Type Description Default
tags Optional[list[str]]

An optional list of tags to reallocate the engine memory for specific memory allocations. Values must be in ("weights", "kv_cache"). If None, all memory is reallocated. wake_up should be called with all tags (or None) before the engine is used again.

None
Source code in vllm/entrypoints/llm.py
def wake_up(self, tags: Optional[list[str]] = None):
    """
    Wake up the engine from sleep mode. See the [sleep][] method
    for more details.

    Args:
        tags: An optional list of tags to reallocate the engine memory
            for specific memory allocations. Values must be in
            `("weights", "kv_cache")`. If None, all memory is reallocated.
            wake_up should be called with all tags (or None) before the
            engine is used again.
    """
    self.llm_engine.wake_up(tags)
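
Continuing the sleep sketch above, the tag-based path lets memory be reallocated in stages after a level 2 sleep. The tag values come from the description above; the weight-update step is only a placeholder:

```
# Level 2: both the weights and the KV cache were discarded.
llm.sleep(level=2)

# First reallocate only the weight buffers
# (e.g. before swapping in updated weights).
llm.wake_up(tags=["weights"])

# ... load or update model weights here (placeholder) ...

# Then reallocate the KV cache; only now should requests be served again.
llm.wake_up(tags=["kv_cache"])
```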

LLMEngine

An LLM engine that receives requests and generates texts.

This is the main class for the vLLM engine. It receives requests from clients and generates texts from the LLM. It includes a tokenizer, a language model (possibly distributed across multiple GPUs), and GPU memory space allocated for intermediate states (aka KV cache). This class utilizes iteration-level scheduling and efficient memory management to maximize the serving throughput.

The LLM class wraps this class for offline batched inference and the AsyncLLMEngine class wraps this class for online serving.

The config arguments are derived from EngineArgs.

Parameters:

Name Type Description Default
vllm_config VllmConfig

The configuration for initializing and running vLLM.

required
executor_class Type[ExecutorBase]

The model executor class for managing distributed execution.

required
log_stats bool

Whether to log statistics.

required
usage_context UsageContext

Specified entry point, used for usage info collection.

ENGINE_CONTEXT
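
To make the request lifecycle concrete, here is a minimal offline loop built only from methods documented on this page (`from_engine_args`, `add_request`, `step`, `has_unfinished_requests`); the model name is an arbitrary example:

```
from vllm import EngineArgs, LLMEngine, SamplingParams

# Arbitrary small model for illustration.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request(
    "req-0",
    "What is vLLM?",
    SamplingParams(temperature=0.0, max_tokens=32),
)

while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)
```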
Source code in vllm/engine/llm_engine.py
class LLMEngine:
    """An LLM engine that receives requests and generates texts.

    This is the main class for the vLLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
    and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
    class wraps this class for online serving.

    The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].

    Args:
        vllm_config: The configuration for initializing and running vLLM.
        executor_class: The model executor class for managing distributed
            execution.
        log_stats: Whether to log statistics.
        usage_context: Specified entry point, used for usage info collection.
    """

    DO_VALIDATE_OUTPUT: ClassVar[bool] = False
    """A flag to toggle whether to validate the type of request output."""

    @classmethod
    @contextmanager
    def enable_output_validation(cls):
        cls.DO_VALIDATE_OUTPUT = True

        yield

        cls.DO_VALIDATE_OUTPUT = False

    @classmethod
    def validate_output(
        cls,
        output: object,
        output_type: Type[_O],
    ) -> _O:
        do_validate = cls.DO_VALIDATE_OUTPUT

        if ((TYPE_CHECKING or do_validate)
                and not isinstance(output, output_type)):
            raise TypeError(f"Expected output of type {output_type}, "
                            f"but found type {type(output)}")

        return cast(_O, output)

    @classmethod
    def validate_outputs(
        cls,
        outputs: GenericSequence[object],
        output_type: Type[_O],
    ) -> List[_O]:
        do_validate = cls.DO_VALIDATE_OUTPUT

        outputs_: List[_O]
        if TYPE_CHECKING or do_validate:
            outputs_ = []
            for output in outputs:
                if not isinstance(output, output_type):
                    raise TypeError(f"Expected output of type {output_type}, "
                                    f"but found type {type(output)}")

                outputs_.append(output)
        else:
            outputs_ = outputs

        return outputs_

    tokenizer: Optional[TokenizerGroup]

    def __init__(
        self,
        vllm_config: VllmConfig,
        executor_class: Type[ExecutorBase],
        log_stats: bool,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
        mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
        use_cached_outputs: bool = False,
    ) -> None:
        if envs.VLLM_USE_V1:
            raise ValueError(
                "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. "
                "This should not happen. As a workaround, try using "
                "LLMEngine.from_vllm_config(...) or explicitly set "
                "VLLM_USE_V1=0 or 1 and report this issue on Github.")

        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config
        self.parallel_config = vllm_config.parallel_config
        self.scheduler_config = vllm_config.scheduler_config
        self.device_config = vllm_config.device_config
        self.speculative_config = vllm_config.speculative_config  # noqa
        self.load_config = vllm_config.load_config
        self.decoding_config = vllm_config.decoding_config or DecodingConfig(  # noqa
        )
        self.observability_config = vllm_config.observability_config or ObservabilityConfig(  # noqa
        )

        logger.info(
            "Initializing a V0 LLM engine (v%s) with config: %s, "
            "use_cached_outputs=%s, ",
            VLLM_VERSION,
            vllm_config,
            use_cached_outputs,
        )

        self.log_stats = log_stats
        self.use_cached_outputs = use_cached_outputs

        if self.model_config.skip_tokenizer_init:
            self.tokenizer = None
            self.detokenizer = None
            tokenizer_group = None
        else:
            self.tokenizer = self._init_tokenizer()
            self.detokenizer = Detokenizer(self.tokenizer)
            tokenizer_group = self.get_tokenizer_group()

        # Ensure that the function doesn't contain a reference to self,
        # to avoid engine GC issues
        def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
            assert tokenizer_group, ("tokenizer_group cannot be None, "
                                     "make sure skip_tokenizer_init is False")
            return tokenizer_group.get_lora_tokenizer(sequence.lora_request)

        self.seq_counter = Counter()
        self.generation_config_fields = (
            self.model_config.try_get_generation_config())

        self.input_preprocessor = InputPreprocessor(self.model_config,
                                                    self.tokenizer,
                                                    mm_registry)

        self.model_executor = executor_class(vllm_config=vllm_config)

        if self.model_config.runner_type != "pooling":
            self._initialize_kv_caches()

        # If usage stat is enabled, collect relevant info.
        if is_usage_stats_enabled():
            from vllm.model_executor.model_loader import (
                get_architecture_class_name)
            usage_message.report_usage(
                get_architecture_class_name(self.model_config),
                usage_context,
                extra_kvs={
                    # Common configuration
                    "dtype":
                    str(self.model_config.dtype),
                    "tensor_parallel_size":
                    self.parallel_config.tensor_parallel_size,
                    "block_size":
                    self.cache_config.block_size,
                    "gpu_memory_utilization":
                    self.cache_config.gpu_memory_utilization,

                    # Quantization
                    "quantization":
                    self.model_config.quantization,
                    "kv_cache_dtype":
                    str(self.cache_config.cache_dtype),

                    # Feature flags
                    "enable_lora":
                    bool(self.lora_config),
                    "enable_prefix_caching":
                    self.cache_config.enable_prefix_caching,
                    "enforce_eager":
                    self.model_config.enforce_eager,
                    "disable_custom_all_reduce":
                    self.parallel_config.disable_custom_all_reduce,
                })

        self.cached_scheduler_outputs = [
            SchedulerOutputState()
            for _ in range(self.parallel_config.pipeline_parallel_size)
        ]

        self.scheduler_contexts = [
            SchedulerContext()
            for _ in range(self.parallel_config.pipeline_parallel_size)
        ]

        if self.model_config.use_async_output_proc:
            process_model_outputs = weak_bind(self._process_model_outputs)

            self.async_callbacks = [
                partial(process_model_outputs,
                        ctx=self.scheduler_contexts[v_id])
                for v_id in range(self.parallel_config.pipeline_parallel_size)
            ]
        else:
            self.async_callbacks = []

        # Currently used by AsyncLLMEngine to ensure quick append
        # of request outputs to asyncio queues
        self.process_request_outputs_callback: Optional[Callable] = None

        # Create the scheduler.
        # NOTE: the cache_config here has been updated with the numbers of
        # GPU and CPU blocks, which are profiled in the distributed executor.
        if isinstance(self.vllm_config.scheduler_config.scheduler_cls, str):
            Scheduler = resolve_obj_by_qualname(
                self.vllm_config.scheduler_config.scheduler_cls)
        else:
            Scheduler = self.vllm_config.scheduler_config.scheduler_cls
        self.scheduler = [
            Scheduler(
                self.scheduler_config, self.cache_config, self.lora_config,
                self.parallel_config.pipeline_parallel_size,
                self.async_callbacks[v_id]
                if self.model_config.use_async_output_proc else None)
            for v_id in range(self.parallel_config.pipeline_parallel_size)
        ]

        # Metric Logging.
        if self.log_stats:
            if stat_loggers is not None:
                self.stat_loggers = stat_loggers
            else:
                # Lazy import for prometheus multiprocessing.
                # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
                # before prometheus_client is imported.
                # See https://prometheus.github.io/client_python/multiprocess/
                from vllm.engine.metrics import (LoggingStatLogger,
                                                 PrometheusStatLogger)

                self.stat_loggers = {
                    "logging":
                    LoggingStatLogger(
                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                        vllm_config=vllm_config),
                    "prometheus":
                    PrometheusStatLogger(
                        local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                        labels=dict(
                            model_name=self.model_config.served_model_name),
                        vllm_config=vllm_config),
                }
                self.stat_loggers["prometheus"].info("cache_config",
                                                     self.cache_config)

        self.tracer = None
        if self.observability_config.otlp_traces_endpoint:
            self.tracer = init_tracer(
                "vllm.llm_engine",
                self.observability_config.otlp_traces_endpoint)

        # Create sequence output processor, e.g. for beam search or
        # speculative decoding.
        self.output_processor = (
            SequenceGroupOutputProcessor.create_output_processor(
                self.scheduler_config,
                self.detokenizer,
                self.scheduler,
                self.seq_counter,
                get_tokenizer_for_seq,
                stop_checker=StopChecker(self.scheduler_config.max_model_len,
                                         get_tokenizer_for_seq),
            ))

        self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}

        # Flag to set when an input fails to process and the engine should run
        # the next step without re-scheduling.
        self._skip_scheduling_next_step = False

        # Don't keep the dummy data in memory
        self.reset_mm_cache()

    def _initialize_kv_caches(self) -> None:
        """Initialize the KV cache in the worker(s).

        The workers will determine the number of blocks in both the GPU cache
        and the swap CPU cache.
        """
        start = time.time()
        num_gpu_blocks, num_cpu_blocks = (
            self.model_executor.determine_num_available_blocks())

        if self.cache_config.num_gpu_blocks_override is not None:
            num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
            logger.info(
                "Overriding num_gpu_blocks=%d with "
                "num_gpu_blocks_override=%d", num_gpu_blocks,
                num_gpu_blocks_override)
            num_gpu_blocks = num_gpu_blocks_override

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
        elapsed = time.time() - start
        logger.info(("init engine (profile, create kv cache, "
                     "warmup model) took %.2f seconds"), elapsed)

    @classmethod
    def _get_executor_cls(cls,
                          engine_config: VllmConfig) -> Type[ExecutorBase]:
        # distributed_executor_backend must be set in VllmConfig.__post_init__
        distributed_executor_backend = (
            engine_config.parallel_config.distributed_executor_backend)
        # Initialize the cluster and specify the executor class.
        if isinstance(distributed_executor_backend, type):
            if not issubclass(distributed_executor_backend, ExecutorBase):
                raise TypeError(
                    "distributed_executor_backend must be a subclass of "
                    f"ExecutorBase. Got {distributed_executor_backend}.")
            executor_class = distributed_executor_backend
        elif distributed_executor_backend == "ray":
            from vllm.executor.ray_distributed_executor import (
                RayDistributedExecutor)
            executor_class = RayDistributedExecutor
        elif distributed_executor_backend == "mp":
            from vllm.executor.mp_distributed_executor import (
                MultiprocessingDistributedExecutor)
            assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
                "multiprocessing distributed executor backend does not "
                "support VLLM_USE_RAY_SPMD_WORKER=1")
            executor_class = MultiprocessingDistributedExecutor
        elif distributed_executor_backend == "uni":
            # JAX-style, single-process, multi-device executor.
            from vllm.executor.uniproc_executor import UniProcExecutor
            executor_class = UniProcExecutor
        elif distributed_executor_backend == "external_launcher":
            # executor with external launcher
            from vllm.executor.uniproc_executor import (  # noqa
                ExecutorWithExternalLauncher)
            executor_class = ExecutorWithExternalLauncher
        else:
            raise ValueError("unrecognized distributed_executor_backend: "
                             f"{distributed_executor_backend}")
        return executor_class

    @classmethod
    def from_vllm_config(
        cls,
        vllm_config: VllmConfig,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
        disable_log_stats: bool = False,
    ) -> "LLMEngine":
        return cls(
            vllm_config=vllm_config,
            executor_class=cls._get_executor_cls(vllm_config),
            log_stats=(not disable_log_stats),
            usage_context=usage_context,
            stat_loggers=stat_loggers,
        )

    @classmethod
    def from_engine_args(
        cls,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        vllm_config = engine_args.create_engine_config(usage_context)

        engine_cls = cls
        if envs.VLLM_USE_V1:
            from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
            engine_cls = V1LLMEngine

        return engine_cls.from_vllm_config(
            vllm_config=vllm_config,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
            disable_log_stats=engine_args.disable_log_stats,
        )

    def __reduce__(self):
        # This is to ensure that the LLMEngine is not referenced in
        # the closure used to initialize Ray worker actors
        raise RuntimeError("LLMEngine should not be pickled!")

    def __del__(self):
        # Shutdown model executor when engine is garbage collected
        # Use getattr since __init__ can fail before the field is set
        if model_executor := getattr(self, "model_executor", None):
            model_executor.shutdown()

    def get_tokenizer_group(self) -> TokenizerGroup:
        if self.tokenizer is None:
            raise ValueError("Unable to get tokenizer because "
                             "skip_tokenizer_init is True")

        return self.tokenizer

    def get_tokenizer(
        self,
        lora_request: Optional[LoRARequest] = None,
    ) -> AnyTokenizer:
        return self.get_tokenizer_group().get_lora_tokenizer(lora_request)

    def _init_tokenizer(self) -> TokenizerGroup:
        return init_tokenizer_from_configs(
            model_config=self.model_config,
            scheduler_config=self.scheduler_config,
            lora_config=self.lora_config)

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)
        if self.lora_config:
            self.lora_config.verify_with_model_config(self.model_config)
            self.lora_config.verify_with_scheduler_config(
                self.scheduler_config)

    def _add_processed_request(
        self,
        request_id: str,
        processed_inputs: ProcessorInputs,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: float,
        lora_request: Optional[LoRARequest],
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
    ) -> Optional[SequenceGroup]:
        """Add a processed request to the engine's request pool.
        Returns the created sequence group.
        """
        if isinstance(params, SamplingParams) and params.n > 1:
            ParallelSampleSequenceGroup.add_request(
                request_id,
                self,
                params,
                processed_inputs=processed_inputs,
                arrival_time=arrival_time,
                lora_request=lora_request,
                trace_headers=trace_headers,
                priority=priority,
            )
            return None

        self._validate_model_inputs(processed_inputs, lora_request)
        # Create the sequences.
        block_size = self.cache_config.block_size
        seq_id = next(self.seq_counter)
        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

        encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)

        seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
                       lora_request)

        encoder_seq = (None if encoder_inputs is None else Sequence(
            seq_id, encoder_inputs, block_size, eos_token_id, lora_request))

        # Create a SequenceGroup based on SamplingParams or PoolingParams
        if isinstance(params, SamplingParams):
            seq_group = self._create_sequence_group_with_sampling(
                request_id,
                seq,
                params,
                arrival_time=arrival_time,
                lora_request=lora_request,
                trace_headers=trace_headers,
                encoder_seq=encoder_seq,
                priority=priority)
        elif isinstance(params, PoolingParams):
            seq_group = self._create_sequence_group_with_pooling(
                request_id,
                seq,
                params,
                arrival_time=arrival_time,
                lora_request=lora_request,
                encoder_seq=encoder_seq,
                priority=priority)
        else:
            raise ValueError(
                "Either SamplingParams or PoolingParams must be provided.")

        # Add the sequence group to the scheduler with least unfinished seqs.
        costs = [
            scheduler.get_num_unfinished_seq_groups()
            for scheduler in self.scheduler
        ]
        min_cost_scheduler = self.scheduler[costs.index(min(costs))]
        min_cost_scheduler.add_seq_group(seq_group)

        return seq_group

    def stop_remote_worker_execution_loop(self) -> None:
        self.model_executor.stop_remote_worker_execution_loop()

    def add_request(
        self,
        request_id: str,
        prompt: PromptType,
        params: Union[SamplingParams, PoolingParams],
        arrival_time: Optional[float] = None,
        lora_request: Optional[LoRARequest] = None,
        tokenization_kwargs: Optional[dict[str, Any]] = None,
        trace_headers: Optional[Mapping[str, str]] = None,
        priority: int = 0,
    ) -> None:
        """Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt to the LLM. See
                [PromptType][vllm.inputs.PromptType]
                for more details about the format of each input.
            params: Parameters for sampling or pooling.
                [SamplingParams][vllm.SamplingParams] for text generation.
                [PoolingParams][vllm.PoolingParams] for pooling.
            arrival_time: The arrival time of the request. If None, we use
                the current monotonic time.
            lora_request: The LoRA request to add.
            trace_headers: OpenTelemetry trace headers.
            priority: The priority of the request.
                Only applicable with priority scheduling.

        Details:
            - Set arrival_time to the current time if it is None.
            - Set prompt_token_ids to the encoded prompt if it is None.
            - Create `n` number of [Sequence][vllm.Sequence] objects.
            - Create a [SequenceGroup][vllm.SequenceGroup] object
              from the list of [Sequence][vllm.Sequence].
            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
              scheduler.

        Example:
            >>> # initialize engine
            >>> engine = LLMEngine.from_engine_args(engine_args)
            >>> # set request arguments
            >>> example_prompt = "Who is the president of the United States?"
            >>> sampling_params = SamplingParams(temperature=0.0)
            >>> request_id = 0
            >>>
            >>> # add the request to the engine
            >>> engine.add_request(
            >>>    str(request_id),
            >>>    example_prompt,
            >>>    SamplingParams(temperature=0.0))
            >>> # continue the request processing
            >>> ...
        """
        if not isinstance(request_id, str):
            raise TypeError(
                f"request_id must be a string, got {type(request_id)}")

        if lora_request is not None and not self.lora_config:
            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                             "not enabled!")

        if priority != 0 and not self.scheduler_config.policy == "priority":
            raise ValueError(f"Got priority {priority} but "
                             "Priority scheduling is not enabled.")

        if isinstance(params, SamplingParams) \
            and params.logits_processors:
            raise ValueError(
                "Logits processors are not supported in multi-step decoding")

        if arrival_time is None:
            arrival_time = time.time()

        if (isinstance(prompt, dict)
                and prompt.get("prompt_embeds", None) is not None
                and not prompt.get("prompt_token_ids", None)):
            seq_len = prompt["prompt_embeds"].shape[0]
            prompt["prompt_token_ids"] = [0] * seq_len

        processed_inputs = self.input_preprocessor.preprocess(
            prompt,
            tokenization_kwargs=tokenization_kwargs,
            lora_request=lora_request,
        )

        self._add_processed_request(
            request_id=request_id,
            processed_inputs=processed_inputs,
            params=params,
            arrival_time=arrival_time,
            lora_request=lora_request,
            trace_headers=trace_headers,
            priority=priority,
        )

    def _create_sequence_group_with_sampling(
        self,
        request_id: str,
        seq: Sequence,
        sampling_params: SamplingParams,
        arrival_time: float,
        lora_request: Optional[LoRARequest],
        trace_headers: Optional[Mapping[str, str]] = None,
        encoder_seq: Optional[Sequence] = None,
        priority: int = 0,
    ) -> SequenceGroup:
        """Creates a SequenceGroup with SamplingParams."""
        max_logprobs = self.get_model_config().max_logprobs
        if (sampling_params.logprobs
                and sampling_params.logprobs > max_logprobs) or (
                    sampling_params.prompt_logprobs
                    and sampling_params.prompt_logprobs > max_logprobs):
            raise ValueError(f"Cannot request more than "
                             f"{max_logprobs} logprobs.")

        sampling_params = self._build_logits_processors(
            sampling_params, lora_request)

        # Defensive copy of SamplingParams, which are used by the sampler,
        # this doesn't deep-copy LogitsProcessor objects
        sampling_params = sampling_params.clone()

        sampling_params.update_from_generation_config(
            self.generation_config_fields, seq.eos_token_id)

        # Create the sequence group.
        draft_size = 1
        if self.vllm_config.speculative_config is not None:
            draft_size = \
                self.vllm_config.speculative_config.num_speculative_tokens + 1
        seq_group = SequenceGroup(request_id=request_id,
                                  seqs=[seq],
                                  arrival_time=arrival_time,
                                  sampling_params=sampling_params,
                                  lora_request=lora_request,
                                  trace_headers=trace_headers,
                                  encoder_seq=encoder_seq,
                                  priority=priority,
                                  draft_size=draft_size)

        return seq_group

    def _create_sequence_group_with_pooling(
        self,
        request_id: str,
        seq: Sequence,
        pooling_params: PoolingParams,
        arrival_time: float,
        lora_request: Optional[LoRARequest],
        encoder_seq: Optional[Sequence] = None,
        priority: int = 0,
    ) -> SequenceGroup:
        """Creates a SequenceGroup with PoolingParams."""
        # Defensive copy of PoolingParams, which are used by the pooler
        pooling_params = pooling_params.clone()
        # Create the sequence group.
        seq_group = SequenceGroup(request_id=request_id,
                                  seqs=[seq],
                                  arrival_time=arrival_time,
                                  lora_request=lora_request,
                                  pooling_params=pooling_params,
                                  encoder_seq=encoder_seq,
                                  priority=priority)
        return seq_group

    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a request(s) with the given ID.

        Args:
            request_id: The ID(s) of the request to abort.

        Details:
            - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].

        Example:
            >>> # initialize engine and add a request with request_id
            >>> request_id = str(0)
            >>> # abort the request
            >>> engine.abort_request(request_id)
        """
        for scheduler in self.scheduler:
            scheduler.abort_seq_group(
                request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)

    def get_vllm_config(self) -> VllmConfig:
        """Gets the vllm configuration."""
        return self.vllm_config

    def get_model_config(self) -> ModelConfig:
        """Gets the model configuration."""
        return self.model_config

    def get_parallel_config(self) -> ParallelConfig:
        """Gets the parallel configuration."""
        return self.parallel_config

    def get_decoding_config(self) -> DecodingConfig:
        """Gets the decoding configuration."""
        return self.decoding_config

    def get_scheduler_config(self) -> SchedulerConfig:
        """Gets the scheduler configuration."""
        return self.scheduler_config

    def get_lora_config(self) -> LoRAConfig:
        """Gets the LoRA configuration."""
        return self.lora_config

    def get_num_unfinished_requests(self) -> int:
        """Gets the number of unfinished requests."""
        return sum(scheduler.get_num_unfinished_seq_groups()
                   for scheduler in self.scheduler)

    def has_unfinished_requests(self) -> bool:
        """Returns True if there are unfinished requests."""
        return any(scheduler.has_unfinished_seqs()
                   for scheduler in self.scheduler)

    def has_unfinished_requests_for_virtual_engine(
            self, virtual_engine: int) -> bool:
        """
        Returns True if there are unfinished requests for the virtual engine.
        """
        return self.scheduler[virtual_engine].has_unfinished_seqs()

    def reset_mm_cache(self) -> bool:
        """Reset the multi-modal cache."""
        return self.input_preprocessor.mm_registry.reset_processor_cache(
            self.model_config)

    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
        """Reset prefix cache for all devices."""

        success = True
        for scheduler in self.scheduler:
            success = success and scheduler.reset_prefix_cache(device)
        return success

    @staticmethod
    def _process_sequence_group_outputs(
        seq_group: SequenceGroup,
        outputs: List[PoolingSequenceGroupOutput],
    ) -> None:
        seq_group.pooled_data = outputs[0].data

        for seq in seq_group.get_seqs():
            seq.status = SequenceStatus.FINISHED_STOPPED

        return

    def _process_model_outputs(self,
                               ctx: SchedulerContext,
                               request_id: Optional[str] = None) -> None:
        """Apply the model output to the sequences in the scheduled seq groups
        and return responses.

        ctx: The virtual engine context to work on
        request_id: If provided, then only this request is going to be processed
        """

        now = time.time()

        if len(ctx.output_queue) == 0:
            return None

        # Get pending async postprocessor
        if request_id:
            # When we process only one request, no pop is required
            # (since later we will process all of the rest)
            (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
             is_last_step, is_first_step_output, skip) = ctx.output_queue[0]
        else:
            (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
             is_last_step, is_first_step_output,
             skip) = ctx.output_queue.popleft()

        # Sanity check
        assert len(seq_group_metadata_list) == len(
            scheduler_outputs.scheduled_seq_groups)

        has_multiple_outputs: bool = len(outputs) > 1
        outputs_by_sequence_group: List[List[SequenceGroupOutput]]
        assert not has_multiple_outputs
        outputs_by_sequence_group = outputs

        # Determine the requests we need to operate on
        if request_id:
            indices = []
            for i, seq_group_meta in enumerate(seq_group_metadata_list):
                if seq_group_meta.request_id == request_id:
                    assert i not in skip  # Cannot be called twice
                    indices.append(i)
                    break

            # If the request_id was not found, then it means that
            # this is a new request that has no pending async
            # postprocessor
            if not indices:
                return
        else:
            indices = range(len(seq_group_metadata_list))  # type: ignore

        finished_before: List[int] = []
        finished_now: List[int] = []
        for i in indices:
            if i in skip:
                continue

            seq_group_meta = seq_group_metadata_list[i]
            scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]

            seq_group: SequenceGroup = scheduled_seq_group.seq_group

            if seq_group.is_finished():
                finished_before.append(i)
                continue

            output: List[SequenceGroupOutput]
            if has_multiple_outputs:
                output = outputs_by_sequence_group[i]
            else:
                output = [outputs_by_sequence_group[0][i]]

            if not is_async:
                seq_group.update_num_computed_tokens(
                    seq_group_meta.token_chunk_size or 0)

            if outputs:
                for o in outputs:
                    if (isinstance(o, SamplerOutput)
                            and seq_group.metrics is not None):
                        if seq_group.metrics.model_forward_time is not None:
                            seq_group.metrics.model_forward_time += (
                                o.model_forward_time or 0)
                        else:
                            seq_group.metrics.model_forward_time = (
                                o.model_forward_time)
                        if seq_group.metrics.model_execute_time is not None:
                            seq_group.metrics.model_execute_time += (
                                o.model_execute_time or 0)
                        else:
                            seq_group.metrics.model_execute_time = (
                                o.model_execute_time)

            if self.model_config.runner_type == "pooling":
                self._process_sequence_group_outputs(seq_group, output)
            else:
                self.output_processor.process_prompt_logprob(seq_group, output)
                if seq_group_meta.do_sample:
                    self.output_processor.process_outputs(
                        seq_group, output, is_async)

            if seq_group.is_finished():
                finished_now.append(i)

        # Generate outputs for the requests that finished this iteration
        for i in finished_now:
            scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]

            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            if not seq_group.is_prefill():
                seq_group.set_last_token_time(now)
            request_output = RequestOutputFactory.create(
                seq_group,
                self.seq_id_to_seq_group,
                use_cache=self.use_cached_outputs)
            if request_output:
                ctx.request_outputs.append(request_output)

        # When we process a single request, we skip it for the next time,
        # and invoke the request output callback (if there was final output)
        if request_id:
            assert len(indices) == 1
            skip.append(indices[0])

            if (finished_now
                    and self.process_request_outputs_callback is not None):
                self.process_request_outputs_callback(ctx.request_outputs)
                ctx.request_outputs.clear()
            return

        # Free currently finished requests
        if finished_now:
            for scheduler in self.scheduler:
                scheduler.free_finished_seq_groups()

        # Create the outputs
        for i in indices:
            if i in skip or i in finished_before or i in finished_now:
                continue  # Avoids double processing

            scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]

            seq_group = scheduled_seq_group.seq_group
            seq_group.maybe_set_first_token_time(now)
            if not seq_group.is_prefill():
                seq_group.set_last_token_time(now)
            request_output = RequestOutputFactory.create(
                seq_group,
                self.seq_id_to_seq_group,
                use_cache=self.use_cached_outputs)
            if request_output:
                ctx.request_outputs.append(request_output)

        # Create outputs only after processing the scheduler's results

        for seq_group in scheduler_outputs.ignored_seq_groups:
            params = seq_group.sampling_params
            if params is not None and params.output_kind == (
                    RequestOutputKind.DELTA) and not seq_group.is_finished():
                continue

            request_output = RequestOutputFactory.create(
                seq_group,
                self.seq_id_to_seq_group,
                use_cache=self.use_cached_outputs,
            )
            if request_output:
                ctx.request_outputs.append(request_output)

        # Immediately process request outputs here (if callback is given)
        if (ctx.request_outputs
                and self.process_request_outputs_callback is not None):
            self.process_request_outputs_callback(ctx.request_outputs)
            ctx.request_outputs.clear()

        # For async case, we need to record the stats here.
        # For non-async case, the stats are done in the
        # LLMEngine/AsyncLLMEngine directly
        if is_async:
            # Log stats.
            self.do_log_stats(scheduler_outputs, outputs, finished_before,
                              skip)

            # Tracing
            self.do_tracing(scheduler_outputs, finished_before)

        return None

    def _advance_to_next_step(
            self, output: SamplerOutput,
            seq_group_metadata_list: List[SequenceGroupMetadata],
            scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None:
        """Given model output from a single run, append the tokens to the
        sequences. This is normally done inside output processor, but it is
        required if the worker is to perform async forward pass to next step.
        """
        for seq_group_metadata, sequence_group_outputs, scheduled_seq_group in \
            zip(seq_group_metadata_list, output, scheduled_seq_groups):
            seq_group = scheduled_seq_group.seq_group

            if seq_group.is_finished():
                continue

            token_chunk_size = (seq_group_metadata.token_chunk_size
                                if seq_group_metadata.token_chunk_size
                                is not None else 0)
            seq_group.update_num_computed_tokens(token_chunk_size)

            if seq_group_metadata.do_sample:
                assert len(sequence_group_outputs.samples) == 1, (
                    "Async output processor expects a single sample"
                    " (i.e sampling_params.n == 1)")
                sample = sequence_group_outputs.samples[0]

                assert len(seq_group.seqs) == 1
                seq = seq_group.seqs[0]

                seq.append_token_id(sample.output_token, sample.logprobs,
                                    sample.output_embed)

    def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
        """Performs one decoding iteration and returns newly generated results.

        <figure markdown="span">
        ![Overview of the step function](https://i.imgur.com/sv2HssD.png)
        <figcaption>Overview of the step function</figcaption>
        </figure>

        Details:
        - Step 1: Schedules the sequences to be executed in the next
            iteration and the token blocks to be swapped in/out/copy.

            - Depending on the scheduling policy,
                sequences may be `preempted/reordered`.
            - A Sequence Group (SG) refers to a group of sequences
                that are generated from the same prompt.

        - Step 2: Calls the distributed executor to execute the model.
        - Step 3: Processes the model output. This mainly includes:

            - Decodes the relevant outputs.
            - Updates the scheduled sequence groups with model outputs
                based on its `sampling parameters` (`use_beam_search` or not).
            - Frees the finished sequence groups.

        - Finally, it creates and returns the newly generated results.

        Example:
        ```
        # Please see the examples/ folder for more detailed examples.

        # initialize engine and request arguments
        engine = LLMEngine.from_engine_args(engine_args)
        example_inputs = [(0, "What is LLM?",
        SamplingParams(temperature=0.0))]

        # Start the engine with an event loop
        while True:
            if example_inputs:
                req_id, prompt, sampling_params = example_inputs.pop(0)
                engine.add_request(str(req_id), prompt, sampling_params)

            # continue the request processing
            request_outputs = engine.step()
            for request_output in request_outputs:
                if request_output.finished:
                    # return or show the request output
                    print(request_output)

            if not (engine.has_unfinished_requests() or example_inputs):
                break
        ```
        """
        if self.parallel_config.pipeline_parallel_size > 1:
            raise NotImplementedError(
                "Pipeline parallelism is only supported through AsyncLLMEngine "
                "as performance will be severely degraded otherwise.")

        # For llm_engine, there is no pipeline parallel support, so the engine
        # used is always 0.
        virtual_engine = 0

        # These are cached outputs from previous iterations. None if on first
        # iteration
        cached_outputs = self.cached_scheduler_outputs[virtual_engine]
        seq_group_metadata_list = cached_outputs.seq_group_metadata_list
        scheduler_outputs = cached_outputs.scheduler_outputs
        allow_async_output_proc = cached_outputs.allow_async_output_proc

        ctx = self.scheduler_contexts[virtual_engine]

        # Clear outputs for each new scheduler iteration
        ctx.request_outputs.clear()

        # Skip the scheduler if there are any remaining steps in the seq groups.
        # This ensures that the scheduler is only called again when the current
        # batch has completed.
        # The scheduler is also skipped if a single request caused the last
        # engine step to fail, and the previous schedule needs to be rerun.
        if not self._has_remaining_steps(
                seq_group_metadata_list
        ) and not self._skip_scheduling_next_step:
            # Schedule iteration
            (seq_group_metadata_list, scheduler_outputs,
             allow_async_output_proc
             ) = self.scheduler[virtual_engine].schedule()

            ctx.seq_group_metadata_list = seq_group_metadata_list
            ctx.scheduler_outputs = scheduler_outputs

            finished_requests_ids = self.scheduler[
                virtual_engine].get_and_reset_finished_requests_ids()
            # When n>1, elements in self.seq_id_to_seq_group should be deleted
            # here, otherwise memory leaks.
            for finished_request_id in finished_requests_ids:
                if finished_request_id in self.seq_id_to_seq_group:
                    del self.seq_id_to_seq_group[finished_request_id]

            # Maybe switch from async mode to sync mode
            if not allow_async_output_proc and len(ctx.output_queue) > 0:
                self._process_model_outputs(ctx=ctx)

        else:
            finished_requests_ids = list()

        assert seq_group_metadata_list is not None
        assert scheduler_outputs is not None

        if not scheduler_outputs.is_empty():

            # Check if we have a cached last_output from the previous iteration.
            # For supporting PP this is probably the best way to pass the
            # sampled_token_ids, as a separate broadcast over all the PP stages
            # will cause one virtual engine's microbatch to block the pipeline.
            last_sampled_token_ids = \
                self._get_last_sampled_token_ids(virtual_engine)

            execute_model_req = ExecuteModelRequest(
                seq_group_metadata_list=seq_group_metadata_list,
                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
                blocks_to_copy=scheduler_outputs.blocks_to_copy,
                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
                running_queue_size=scheduler_outputs.running_queue_size,
                finished_requests_ids=finished_requests_ids,
                # We use ExecuteModelRequest to pass the last sampled_token_ids
                # to each of the non-last PP stages for in-place prepare_input.
                last_sampled_token_ids=last_sampled_token_ids)

            if allow_async_output_proc:
                execute_model_req.async_callback = self.async_callbacks[
                    virtual_engine]

            try:
                outputs = self.model_executor.execute_model(
                    execute_model_req=execute_model_req)
                self._skip_scheduling_next_step = False
            except InputProcessingError as e:
                # The input for this request cannot be processed, so we must
                # abort it. If there are remaining requests in the batch that
                # have been scheduled, they will be retried on the next step.
                invalid_request_id = e.request_id
                self._abort_and_cache_schedule(
                    request_id=invalid_request_id,
                    virtual_engine=virtual_engine,
                    seq_group_metadata_list=seq_group_metadata_list,
                    scheduler_outputs=scheduler_outputs,
                    allow_async_output_proc=allow_async_output_proc)
                # Raise so the caller is notified that this request failed
                raise

        else:
            # Nothing scheduled => If there is pending async postprocessor,
            # then finish it here.
            if len(ctx.output_queue) > 0:
                self._process_model_outputs(ctx=ctx)
            # No outputs in this case
            outputs = []

        if not self._has_remaining_steps(seq_group_metadata_list):
            # is_first_step_output is True only when the num_steps of all
            # the sequences are 1.
            is_first_step_output: bool = False if not seq_group_metadata_list \
                else seq_group_metadata_list[0].state.num_steps == 1

            # Add results to the output_queue
            ctx.append_output(outputs=outputs,
                              seq_group_metadata_list=seq_group_metadata_list,
                              scheduler_outputs=scheduler_outputs,
                              is_async=allow_async_output_proc,
                              is_last_step=True,
                              is_first_step_output=is_first_step_output)

            if outputs and allow_async_output_proc:
                assert len(outputs) == 1, (
                    "Async postprocessor expects only a single output set")

                self._advance_to_next_step(
                    outputs[0], seq_group_metadata_list,
                    scheduler_outputs.scheduled_seq_groups)

            # Check if need to run the usual non-async path
            if not allow_async_output_proc:
                self._process_model_outputs(ctx=ctx)

                # Log stats.
                self.do_log_stats(scheduler_outputs, outputs)

                # Tracing
                self.do_tracing(scheduler_outputs)
        else:
            # Multi-step case
            return ctx.request_outputs

        if not self.has_unfinished_requests():
            # Drain async postprocessor (if exists)
            if len(ctx.output_queue) > 0:
                self._process_model_outputs(ctx=ctx)
            assert len(ctx.output_queue) == 0

            # Stop the execute model loop in parallel workers until there are
            # more requests to process. This avoids waiting indefinitely in
            # torch.distributed ops which may otherwise timeout, and unblocks
            # the RPC thread in the workers so that they can process any other
            # queued control plane messages, such as add/remove lora adapters.
            logger.debug("Stopping remote worker execution loop.")
            self.model_executor.stop_remote_worker_execution_loop()

        return ctx.request_outputs

    def _abort_and_cache_schedule(
            self, request_id: str, virtual_engine: int,
            seq_group_metadata_list: List[SequenceGroupMetadata],
            scheduler_outputs: SchedulerOutputs,
            allow_async_output_proc: bool) -> None:
        """Aborts a single request, and caches the scheduler outputs minus that
        request. This allows the next step to continue processing the remaining
        requests without having to re-run the scheduler."""

        # Abort the request and remove its sequence group from the current
        # schedule
        self.abort_request(request_id)
        for i, metadata in enumerate(seq_group_metadata_list):
            if metadata.request_id == request_id:
                del seq_group_metadata_list[i]
                break
        for i, group in enumerate(scheduler_outputs.scheduled_seq_groups):
            if group.seq_group.request_id == request_id:
                del scheduler_outputs.scheduled_seq_groups[i]
                break

        # If there are still other sequence groups left in the schedule, cache
        # them and flag the engine to reuse the schedule.
        if len(seq_group_metadata_list) > 0:
            self._skip_scheduling_next_step = True
            # Reuse multi-step caching logic
            self._cache_scheduler_outputs_for_multi_step(
                virtual_engine=virtual_engine,
                scheduler_outputs=scheduler_outputs,
                seq_group_metadata_list=seq_group_metadata_list,
                allow_async_output_proc=allow_async_output_proc)

    def _has_remaining_steps(
        self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]]
    ) -> bool:
        return False

    def _cache_scheduler_outputs_for_multi_step(
            self, virtual_engine: int,
            seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
            scheduler_outputs: SchedulerOutputs,
            allow_async_output_proc: bool) -> None:
        co = self.cached_scheduler_outputs[virtual_engine]

        co.seq_group_metadata_list = seq_group_metadata_list
        co.scheduler_outputs = scheduler_outputs
        co.allow_async_output_proc = allow_async_output_proc
        co.last_output = None

    def _update_cached_scheduler_output(
            self, virtual_engine: int,
            output: List[Optional[SamplerOutput]]) -> None:
        if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0
                and output[0] is not None):
            last_output = output[-1]
            assert last_output is not None
            assert last_output.sampled_token_ids_cpu is not None
            assert last_output.sampled_token_ids is None
            assert last_output.sampled_token_probs is None
            self.cached_scheduler_outputs[
                virtual_engine].last_output = last_output

    def _get_last_sampled_token_ids(
            self, virtual_engine: int) -> Optional[torch.Tensor]:
        return None

    def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
        if not self.log_stats:
            raise RuntimeError(
                "Stat logging is disabled. Set `disable_log_stats=False` "
                "argument to enable.")
        if logger_name in self.stat_loggers:
            raise KeyError(f"Logger with name {logger_name} already exists.")
        self.stat_loggers[logger_name] = logger

    def remove_logger(self, logger_name: str) -> None:
        if not self.log_stats:
            raise RuntimeError(
                "Stat logging is disabled. Set `disable_log_stats=False` "
                "argument to enable.")
        if logger_name not in self.stat_loggers:
            raise KeyError(f"Logger with name {logger_name} does not exist.")
        del self.stat_loggers[logger_name]

    def do_log_stats(self,
                     scheduler_outputs: Optional[SchedulerOutputs] = None,
                     model_output: Optional[List[SamplerOutput]] = None,
                     finished_before: Optional[List[int]] = None,
                     skip: Optional[List[int]] = None) -> None:
        """Forced log when no requests active."""
        if self.log_stats:
            stats = self._get_stats(scheduler_outputs, model_output,
                                    finished_before, skip)
            for logger in self.stat_loggers.values():
                logger.log(stats)

    def _get_stats(self,
                   scheduler_outputs: Optional[SchedulerOutputs],
                   model_output: Optional[List[SamplerOutput]] = None,
                   finished_before: Optional[List[int]] = None,
                   skip: Optional[List[int]] = None) -> Stats:
        """Get Stats to be Logged to Prometheus.

        Args:
            scheduler_outputs: Optional, used to populate metrics related to
                the scheduled batch,
            model_output: Optional, used to emit speculative decoding metrics
                which are created by the workers.
            finished_before: Optional, indices of sequences that were finished
                before. These sequences will be ignored.
            skip: Optional, indices of sequences that were preempted. These
                sequences will be ignored.
        """
        now = time.time()

        # System State
        #   Scheduler State
        num_running_sys = sum(
            len(scheduler.running) for scheduler in self.scheduler)
        num_swapped_sys = sum(
            len(scheduler.swapped) for scheduler in self.scheduler)
        num_waiting_sys = sum(
            len(scheduler.waiting) for scheduler in self.scheduler)

        # KV Cache Usage in %
        num_total_gpu = self.cache_config.num_gpu_blocks
        gpu_cache_usage_sys = 0.
        if num_total_gpu:  # Guard against both None and 0
            num_free_gpu = sum(
                scheduler.block_manager.get_num_free_gpu_blocks()
                for scheduler in self.scheduler)
            gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)

        num_total_cpu = self.cache_config.num_cpu_blocks
        cpu_cache_usage_sys = 0.
        if num_total_cpu:  # Guard against both None and 0
            num_free_cpu = sum(
                scheduler.block_manager.get_num_free_cpu_blocks()
                for scheduler in self.scheduler)
            cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)

        # Prefix Cache Hit Rate. Note that we always use
        # the cache hit rate of the first virtual engine.
        cpu_prefix_cache_hit_rate = self.scheduler[
            0].get_prefix_cache_hit_rate(Device.CPU)
        gpu_prefix_cache_hit_rate = self.scheduler[
            0].get_prefix_cache_hit_rate(Device.GPU)

        # Exchange the usage and cache hit stats between gpu and cpu when
        # running on cpu because the cpu_worker.py intentionally reports the
        # number of cpu blocks as gpu blocks in favor of cache management.
        if self.device_config.device_type == "cpu":
            num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu
            gpu_cache_usage_sys, cpu_cache_usage_sys = (
                cpu_cache_usage_sys,
                gpu_cache_usage_sys,
            )
            gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = (
                cpu_prefix_cache_hit_rate,
                gpu_prefix_cache_hit_rate,
            )

        # Iteration stats
        num_prompt_tokens_iter = 0
        num_generation_tokens_iter = 0
        num_tokens_iter = 0
        time_to_first_tokens_iter: List[float] = []
        time_per_output_tokens_iter: List[float] = []
        num_preemption_iter = (0 if scheduler_outputs is None else
                               scheduler_outputs.preempted)

        # Request stats
        #   Latency
        time_e2e_requests: List[float] = []
        time_queue_requests: List[float] = []
        time_inference_requests: List[float] = []
        time_prefill_requests: List[float] = []
        time_decode_requests: List[float] = []
        #   Metadata
        num_prompt_tokens_requests: List[int] = []
        num_generation_tokens_requests: List[int] = []
        n_requests: List[int] = []
        max_num_generation_tokens_requests: List[int] = []
        max_tokens_requests: List[int] = []
        finished_reason_requests: List[str] = []

        # LoRA requests
        running_lora_adapters = dict(
            collectionsCounter([
                running_request.lora_request.lora_name
                for scheduler in self.scheduler
                for running_request in scheduler.running
                if running_request.lora_request
            ]))
        waiting_lora_adapters = dict(
            collectionsCounter([
                waiting_request.lora_request.lora_name
                for scheduler in self.scheduler
                for waiting_request in scheduler.waiting
                if waiting_request.lora_request
            ]))
        max_lora_stat = "0"
        if self.lora_config:
            max_lora_stat = str(self.lora_config.max_loras)

        # NOTE: This loop assumes prefill seq_groups are before
        # decode seq_groups in scheduled_seq_groups.
        if scheduler_outputs is not None:
            # For async postprocessor, already finished sequences need to be
            # not counted (to avoid double counting)
            actual_num_batched_tokens = scheduler_outputs.num_batched_tokens  # type: ignore

            num_generation_tokens_from_prefill_groups = 0
            # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
            # the len of scheduler_outputs.scheduled_seq_groups is !=
            # scheduler_outputs.num_prefill_groups, this means that
            # chunked prefills have been detected.

            for idx, scheduled_seq_group in enumerate(
                    scheduler_outputs.scheduled_seq_groups):
                # Skip double logging when using async output proc
                if finished_before and idx in finished_before:
                    actual_num_batched_tokens -= 1
                    continue

                # Currently, skip == preempted sequences, so we need to skip
                # their log stats
                if skip and idx in skip:
                    continue

                group_was_prefill = idx < scheduler_outputs.num_prefill_groups
                seq_group = scheduled_seq_group.seq_group

                # NOTE: a seq_group that completed all of its prefill tokens
                # in the last iteration will have seq_group.is_prefill() = False
                # with group_was_prefill = True
                if group_was_prefill:
                    # Number of prompt tokens.
                    num_prompt_tokens_iter += (
                        scheduled_seq_group.token_chunk_size)

                    # If the seq_group just finished the prefill state
                    # get TTFT.
                    if not seq_group.is_prefill():
                        latency = seq_group.get_last_token_latency()
                        time_to_first_tokens_iter.append(latency)

                        # One generation token per finished prefill.
                        num_generation_tokens_from_prefill_groups += (
                            seq_group.num_seqs())
                else:
                    # TPOTs.
                    latency = seq_group.get_last_token_latency()
                    time_per_output_tokens_iter.append(latency)
                    if seq_group.state.current_step == 0:
                        # For async_output_proc, the do_log_stats()
                        # is called following init_multi_step(), which
                        # sets the current_step to zero.
                        actual_num_batched_tokens +=\
                            seq_group.state.num_steps - 1
                    else:
                        actual_num_batched_tokens +=\
                            seq_group.state.current_step - 1

                # Because of chunked prefill, we can have a single sequence
                # group that does multiple prompt_runs. To prevent logging
                # the same metadata more than once per request, we standardize
                # on logging request level information for finished requests,
                # which can only happen once.
                if seq_group.is_finished():
                    # Latency timings
                    time_e2e_requests.append(now -
                                             seq_group.metrics.arrival_time)
                    if (seq_group.metrics.first_scheduled_time is not None and
                            seq_group.metrics.first_token_time is not None):
                        time_queue_requests.append(
                            seq_group.metrics.first_scheduled_time -
                            seq_group.metrics.arrival_time)
                        time_prefill_requests.append(
                            seq_group.metrics.first_token_time -
                            seq_group.metrics.first_scheduled_time)
                        time_decode_requests.append(
                            now - seq_group.metrics.first_token_time)
                        time_inference_requests.append(
                            now - seq_group.metrics.first_scheduled_time)
                    # Metadata
                    num_prompt_tokens_requests.append(
                        len(seq_group.prompt_token_ids))
                    num_generation_tokens_requests.extend([
                        seq.get_output_len()
                        for seq in seq_group.get_finished_seqs()
                    ])
                    max_num_generation_tokens_requests.append(
                        max(seq.get_output_len()
                            for seq in seq_group.get_seqs()))
                    if seq_group.sampling_params is not None:
                        n_requests.append(seq_group.sampling_params.n)
                        max_tokens_requests.append(
                            seq_group.sampling_params.max_tokens)
                    finished_reason_requests.extend([
                        SequenceStatus.get_finished_reason(seq.status)
                        for seq in seq_group.get_finished_seqs()
                    ])

            # Number of generation tokens.
            #   num_batched_tokens equals the number of prompt_tokens plus the
            #   number of decode_tokens in a single iteration. So,
            #   num_generation_tokens = num_batched_tokens - num_prompt_tokens
            #   + num_generation_tokens_from_prefill_groups (since we generate
            #   one token on prefills on iters where the prefill finishes).
            num_generation_tokens_iter = (
                actual_num_batched_tokens - num_prompt_tokens_iter +
                num_generation_tokens_from_prefill_groups)
            num_tokens_iter = (num_generation_tokens_iter +
                               num_prompt_tokens_iter)

        return Stats(
            now=now,
            # System stats
            #   Scheduler State
            num_running_sys=num_running_sys,
            num_swapped_sys=num_swapped_sys,
            num_waiting_sys=num_waiting_sys,
            #   KV Cache Usage in %
            gpu_cache_usage_sys=gpu_cache_usage_sys,
            cpu_cache_usage_sys=cpu_cache_usage_sys,
            #   Prefix Cache Hit Rate
            cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate,
            gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate,

            # Iteration stats
            num_prompt_tokens_iter=num_prompt_tokens_iter,
            num_generation_tokens_iter=num_generation_tokens_iter,
            num_tokens_iter=num_tokens_iter,
            time_to_first_tokens_iter=time_to_first_tokens_iter,
            time_per_output_tokens_iter=time_per_output_tokens_iter,
            num_preemption_iter=num_preemption_iter,

            # Request stats
            #   Latency
            time_e2e_requests=time_e2e_requests,
            time_queue_requests=time_queue_requests,
            time_inference_requests=time_inference_requests,
            time_prefill_requests=time_prefill_requests,
            time_decode_requests=time_decode_requests,
            #   Metadata
            num_prompt_tokens_requests=num_prompt_tokens_requests,
            num_generation_tokens_requests=num_generation_tokens_requests,
            max_num_generation_tokens_requests=
            max_num_generation_tokens_requests,
            n_requests=n_requests,
            max_tokens_requests=max_tokens_requests,
            finished_reason_requests=finished_reason_requests,
            max_lora=str(max_lora_stat),
            waiting_lora_adapters=list(waiting_lora_adapters.keys()),
            running_lora_adapters=list(running_lora_adapters.keys()))

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.model_executor.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.model_executor.remove_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.model_executor.list_loras()

    def pin_lora(self, lora_id: int) -> bool:
        return self.model_executor.pin_lora(lora_id)

    def start_profile(self) -> None:
        self.model_executor.start_profile()

    def stop_profile(self) -> None:
        self.model_executor.stop_profile()

    def sleep(self, level: int = 1) -> None:
        assert self.vllm_config.model_config.enable_sleep_mode, (
            "Sleep mode is not enabled in the model config")
        self.model_executor.sleep(level=level)

    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        assert self.vllm_config.model_config.enable_sleep_mode, (
            "Sleep mode is not enabled in the model config")
        self.model_executor.wake_up(tags)

    def is_sleeping(self) -> bool:
        return self.model_executor.is_sleeping

    def check_health(self) -> None:
        self.model_executor.check_health()

    def is_tracing_enabled(self) -> bool:
        return self.tracer is not None

    def do_tracing(self,
                   scheduler_outputs: SchedulerOutputs,
                   finished_before: Optional[List[int]] = None) -> None:
        if self.tracer is None:
            return

        for idx, scheduled_seq_group in enumerate(
                scheduler_outputs.scheduled_seq_groups):
            # Skip double tracing when using async output proc
            if finished_before and idx in finished_before:
                continue

            seq_group = scheduled_seq_group.seq_group
            if seq_group.is_finished():
                self.create_trace_span(seq_group)

    def create_trace_span(self, seq_group: SequenceGroup) -> None:
        if self.tracer is None or seq_group.sampling_params is None:
            return
        arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9)

        trace_context = extract_trace_context(seq_group.trace_headers)

        with self.tracer.start_as_current_span(
                "llm_request",
                kind=SpanKind.SERVER,
                context=trace_context,
                start_time=arrival_time_nano_seconds) as seq_span:
            metrics = seq_group.metrics

            # Handle potential None values for cancelled/aborted requests
            ttft = (metrics.first_token_time - metrics.arrival_time
                    if metrics.first_token_time is not None else None)

            e2e_time = (metrics.finished_time - metrics.arrival_time
                        if metrics.finished_time is not None else None)

            seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
                                   self.model_config.model)
            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
                                   seq_group.request_id)
            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
                                   seq_group.sampling_params.temperature)
            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
                                   seq_group.sampling_params.top_p)
            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
                                   seq_group.sampling_params.max_tokens)
            seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
                                   seq_group.sampling_params.n)
            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
                                   seq_group.num_seqs())
            seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
                                   len(seq_group.prompt_token_ids))
            seq_span.set_attribute(
                SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                sum([
                    seq.get_output_len()
                    for seq in seq_group.get_finished_seqs()
                ]))

            # Only set timing attributes if the values are available
            if metrics.time_in_queue is not None:
                seq_span.set_attribute(
                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                    metrics.time_in_queue)
            if ttft is not None:
                seq_span.set_attribute(
                    SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
            if e2e_time is not None:
                seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E,
                                       e2e_time)
            if metrics.scheduler_time is not None:
                seq_span.set_attribute(
                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
                    metrics.scheduler_time)
            if metrics.model_forward_time is not None:
                seq_span.set_attribute(
                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
                    metrics.model_forward_time / 1000.0)
            if metrics.model_execute_time is not None:
                seq_span.set_attribute(
                    SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
                    metrics.model_execute_time)

    def _validate_model_inputs(self, inputs: ProcessorInputs,
                               lora_request: Optional[LoRARequest]):
        encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

        if encoder_inputs is not None:
            self._validate_model_input(encoder_inputs,
                                       lora_request,
                                       prompt_type="encoder")

        self._validate_model_input(decoder_inputs,
                                   lora_request,
                                   prompt_type="decoder")

    def _validate_model_input(
        self,
        prompt_inputs: SingletonInputs,
        lora_request: Optional[LoRARequest],
        *,
        prompt_type: Literal["encoder", "decoder"],
    ):
        model_config = self.model_config
        tokenizer = (None if self.tokenizer is None else
                     self.tokenizer.get_lora_tokenizer(lora_request))

        prompt_ids = prompt_inputs.get("prompt_token_ids", [])
        if not prompt_ids:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                pass  # Mllama may have empty encoder inputs for text-only data
            elif prompt_inputs["type"] == "embeds":
                pass
            else:
                raise ValueError(f"The {prompt_type} prompt cannot be empty")

        if tokenizer is not None:
            max_input_id = max(prompt_ids, default=0)
            if max_input_id > tokenizer.max_token_id:
                raise ValueError(
                    f"Token id {max_input_id} is out of vocabulary")

        max_prompt_len = self.model_config.max_model_len
        if len(prompt_ids) > max_prompt_len:
            if prompt_type == "encoder" and model_config.is_multimodal_model:
                mm_registry = self.input_preprocessor.mm_registry
                mm_processor = mm_registry.create_processor(
                    model_config,
                    tokenizer=tokenizer or object(),  # Dummy if no tokenizer
                )
                assert isinstance(mm_processor, EncDecMultiModalProcessor)

                if mm_processor.pad_dummy_encoder_prompt:
                    return  # Skip encoder length check for Whisper and Donut

            if model_config.is_multimodal_model:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens plus multimodal tokens. For image "
                    "inputs, the number of image tokens depends on the number "
                    "of images, and possibly their aspect ratios as well.")
            else:
                suggestion = (
                    "Make sure that `max_model_len` is no smaller than the "
                    "number of text tokens.")

            raise ValueError(
                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
                f"longer than the maximum model length of {max_prompt_len}. "
                f"{suggestion}")

            # TODO: Find out how many placeholder tokens are there so we can
            # check that chunked prefill does not truncate them
            # max_batch_len = self.scheduler_config.max_num_batched_tokens

    def _build_logits_processors(
            self, sampling_params: SamplingParams,
            lora_request: Optional[LoRARequest]) -> SamplingParams:
        """Constructs logits processors based on the logits_bias, and
        allowed_token_ids fields in sampling_params. Deletes those fields and
        adds the constructed logits processors to the logits_processors field.
        Returns the modified sampling params."""

        logits_processors = []

        if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
            tokenizer = self.get_tokenizer(lora_request=lora_request)

            processors = get_openai_logits_processors(
                logit_bias=sampling_params.logit_bias,
                allowed_token_ids=sampling_params.allowed_token_ids,
                tokenizer=tokenizer)
            logits_processors.extend(processors)

            # Unset so these don't get passed down to the model
            sampling_params.logit_bias = None
            sampling_params.allowed_token_ids = None

        if len(sampling_params.bad_words) > 0:
            tokenizer = self.get_tokenizer(lora_request)
            processors = get_bad_words_logits_processors(
                bad_words=sampling_params.bad_words, tokenizer=tokenizer)
            logits_processors.extend(processors)

        if logits_processors:
            if sampling_params.logits_processors is None:
                sampling_params.logits_processors = logits_processors
            else:
                sampling_params.logits_processors.extend(logits_processors)

        return sampling_params

    def collective_rpc(self,
                       method: Union[str, Callable[..., _R]],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
        return self.model_executor.collective_rpc(method, timeout, args,
                                                  kwargs)
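
`collective_rpc` forwards directly to the executor, which runs the given method (or callable) on every worker and gathers the per-worker results into a list. A minimal usage sketch, assuming `engine` is an initialized `LLMEngine` and that the worker class exposes a hypothetical `get_device_id` method (not part of the code shown here):

```
# Dispatch a worker-side method by name across all workers;
# `get_device_id` is a hypothetical example method, and `args`/`kwargs`
# are forwarded to it on each worker.
device_ids = engine.collective_rpc("get_device_id", timeout=10.0)
print(device_ids)  # one entry per worker, e.g. [0, 1] with two workers
```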

DO_VALIDATE_OUTPUT class-attribute

DO_VALIDATE_OUTPUT: bool = False

A flag to toggle whether to validate the type of request output.
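
The flag is `False` by default. A hedged sketch of how a test or debugging session might toggle it, assuming output-type validation is keyed off this class attribute:

```
from vllm import LLMEngine

# Assumption: enabling the class-level flag makes the engine validate the
# type of the request outputs it produces; restore the default afterwards.
LLMEngine.DO_VALIDATE_OUTPUT = True
# ... run requests through the engine ...
LLMEngine.DO_VALIDATE_OUTPUT = False
```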

_skip_scheduling_next_step instance-attribute

_skip_scheduling_next_step = False

async_callbacks instance-attribute

async_callbacks = [
    (
        partial(
            process_model_outputs,
            ctx=scheduler_contexts[v_id],
        )
    )
    for v_id in (range(pipeline_parallel_size))
]

cache_config instance-attribute

cache_config = cache_config

cached_scheduler_outputs instance-attribute

cached_scheduler_outputs = [
    (SchedulerOutputState())
    for _ in (range(pipeline_parallel_size))
]

decoding_config instance-attribute

decoding_config = decoding_config or DecodingConfig()

detokenizer instance-attribute

detokenizer = None

device_config instance-attribute

device_config = device_config

generation_config_fields instance-attribute

generation_config_fields = try_get_generation_config()

input_preprocessor instance-attribute

input_preprocessor = InputPreprocessor(
    model_config, tokenizer, mm_registry
)

load_config instance-attribute

load_config = load_config

log_stats instance-attribute

log_stats = log_stats

lora_config instance-attribute

lora_config = lora_config

model_config instance-attribute

model_config = model_config

model_executor instance-attribute

model_executor = executor_class(vllm_config=vllm_config)

observability_config instance-attribute

observability_config = (
    observability_config or ObservabilityConfig()
)

output_processor instance-attribute

output_processor = create_output_processor(
    scheduler_config,
    detokenizer,
    scheduler,
    seq_counter,
    get_tokenizer_for_seq,
    stop_checker=StopChecker(
        max_model_len, get_tokenizer_for_seq
    ),
)

parallel_config instance-attribute

parallel_config = parallel_config

process_request_outputs_callback instance-attribute

process_request_outputs_callback: Optional[Callable] = None

scheduler instance-attribute

scheduler = [
    (
        Scheduler(
            scheduler_config,
            cache_config,
            lora_config,
            pipeline_parallel_size,
            async_callbacks[v_id]
            if use_async_output_proc
            else None,
        )
    )
    for v_id in (range(pipeline_parallel_size))
]

scheduler_config instance-attribute

scheduler_config = scheduler_config

scheduler_contexts instance-attribute

scheduler_contexts = [
    (SchedulerContext())
    for _ in (range(pipeline_parallel_size))
]

seq_counter instance-attribute

seq_counter = Counter()

seq_id_to_seq_group instance-attribute

seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}

speculative_config instance-attribute

speculative_config = speculative_config

stat_loggers instance-attribute

stat_loggers = stat_loggers

tokenizer instance-attribute

tracer instance-attribute

tracer = None

use_cached_outputs instance-attribute

use_cached_outputs = use_cached_outputs

vllm_config instance-attribute

vllm_config = vllm_config

__del__

__del__()
Source code in vllm/engine/llm_engine.py
def __del__(self):
    # Shutdown model executor when engine is garbage collected
    # Use getattr since __init__ can fail before the field is set
    if model_executor := getattr(self, "model_executor", None):
        model_executor.shutdown()

__init__

__init__(
    vllm_config: VllmConfig,
    executor_class: Type[ExecutorBase],
    log_stats: bool,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    stat_loggers: Optional[
        Dict[str, StatLoggerBase]
    ] = None,
    mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    use_cached_outputs: bool = False,
) -> None
Source code in vllm/engine/llm_engine.py
def __init__(
    self,
    vllm_config: VllmConfig,
    executor_class: Type[ExecutorBase],
    log_stats: bool,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
    use_cached_outputs: bool = False,
) -> None:
    if envs.VLLM_USE_V1:
        raise ValueError(
            "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. "
            "This should not happen. As a workaround, try using "
            "LLMEngine.from_vllm_config(...) or explicitly set "
            "VLLM_USE_V1=0 or 1 and report this issue on Github.")

    self.vllm_config = vllm_config
    self.model_config = vllm_config.model_config
    self.cache_config = vllm_config.cache_config
    self.lora_config = vllm_config.lora_config
    self.parallel_config = vllm_config.parallel_config
    self.scheduler_config = vllm_config.scheduler_config
    self.device_config = vllm_config.device_config
    self.speculative_config = vllm_config.speculative_config  # noqa
    self.load_config = vllm_config.load_config
    self.decoding_config = vllm_config.decoding_config or DecodingConfig(  # noqa
    )
    self.observability_config = vllm_config.observability_config or ObservabilityConfig(  # noqa
    )

    logger.info(
        "Initializing a V0 LLM engine (v%s) with config: %s, "
        "use_cached_outputs=%s, ",
        VLLM_VERSION,
        vllm_config,
        use_cached_outputs,
    )

    self.log_stats = log_stats
    self.use_cached_outputs = use_cached_outputs

    if self.model_config.skip_tokenizer_init:
        self.tokenizer = None
        self.detokenizer = None
        tokenizer_group = None
    else:
        self.tokenizer = self._init_tokenizer()
        self.detokenizer = Detokenizer(self.tokenizer)
        tokenizer_group = self.get_tokenizer_group()

    # Ensure that the function doesn't contain a reference to self,
    # to avoid engine GC issues
    def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
        assert tokenizer_group, ("tokenizer_group cannot be None, "
                                 "make sure skip_tokenizer_init is False")
        return tokenizer_group.get_lora_tokenizer(sequence.lora_request)

    self.seq_counter = Counter()
    self.generation_config_fields = (
        self.model_config.try_get_generation_config())

    self.input_preprocessor = InputPreprocessor(self.model_config,
                                                self.tokenizer,
                                                mm_registry)

    self.model_executor = executor_class(vllm_config=vllm_config)

    if self.model_config.runner_type != "pooling":
        self._initialize_kv_caches()

    # If usage stat is enabled, collect relevant info.
    if is_usage_stats_enabled():
        from vllm.model_executor.model_loader import (
            get_architecture_class_name)
        usage_message.report_usage(
            get_architecture_class_name(self.model_config),
            usage_context,
            extra_kvs={
                # Common configuration
                "dtype":
                str(self.model_config.dtype),
                "tensor_parallel_size":
                self.parallel_config.tensor_parallel_size,
                "block_size":
                self.cache_config.block_size,
                "gpu_memory_utilization":
                self.cache_config.gpu_memory_utilization,

                # Quantization
                "quantization":
                self.model_config.quantization,
                "kv_cache_dtype":
                str(self.cache_config.cache_dtype),

                # Feature flags
                "enable_lora":
                bool(self.lora_config),
                "enable_prefix_caching":
                self.cache_config.enable_prefix_caching,
                "enforce_eager":
                self.model_config.enforce_eager,
                "disable_custom_all_reduce":
                self.parallel_config.disable_custom_all_reduce,
            })

    self.cached_scheduler_outputs = [
        SchedulerOutputState()
        for _ in range(self.parallel_config.pipeline_parallel_size)
    ]

    self.scheduler_contexts = [
        SchedulerContext()
        for _ in range(self.parallel_config.pipeline_parallel_size)
    ]

    if self.model_config.use_async_output_proc:
        process_model_outputs = weak_bind(self._process_model_outputs)

        self.async_callbacks = [
            partial(process_model_outputs,
                    ctx=self.scheduler_contexts[v_id])
            for v_id in range(self.parallel_config.pipeline_parallel_size)
        ]
    else:
        self.async_callbacks = []

    # Currently used by AsyncLLMEngine to ensure quick append
    # of request outputs to asyncio queues
    self.process_request_outputs_callback: Optional[Callable] = None

    # Create the scheduler.
    # NOTE: the cache_config here has been updated with the numbers of
    # GPU and CPU blocks, which are profiled in the distributed executor.
    if isinstance(self.vllm_config.scheduler_config.scheduler_cls, str):
        Scheduler = resolve_obj_by_qualname(
            self.vllm_config.scheduler_config.scheduler_cls)
    else:
        Scheduler = self.vllm_config.scheduler_config.scheduler_cls
    self.scheduler = [
        Scheduler(
            self.scheduler_config, self.cache_config, self.lora_config,
            self.parallel_config.pipeline_parallel_size,
            self.async_callbacks[v_id]
            if self.model_config.use_async_output_proc else None)
        for v_id in range(self.parallel_config.pipeline_parallel_size)
    ]

    # Metric Logging.
    if self.log_stats:
        if stat_loggers is not None:
            self.stat_loggers = stat_loggers
        else:
            # Lazy import for prometheus multiprocessing.
            # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
            # before prometheus_client is imported.
            # See https://prometheus.github.io/client_python/multiprocess/
            from vllm.engine.metrics import (LoggingStatLogger,
                                             PrometheusStatLogger)

            self.stat_loggers = {
                "logging":
                LoggingStatLogger(
                    local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                    vllm_config=vllm_config),
                "prometheus":
                PrometheusStatLogger(
                    local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                    labels=dict(
                        model_name=self.model_config.served_model_name),
                    vllm_config=vllm_config),
            }
            self.stat_loggers["prometheus"].info("cache_config",
                                                 self.cache_config)

    self.tracer = None
    if self.observability_config.otlp_traces_endpoint:
        self.tracer = init_tracer(
            "vllm.llm_engine",
            self.observability_config.otlp_traces_endpoint)

    # Create sequence output processor, e.g. for beam search or
    # speculative decoding.
    self.output_processor = (
        SequenceGroupOutputProcessor.create_output_processor(
            self.scheduler_config,
            self.detokenizer,
            self.scheduler,
            self.seq_counter,
            get_tokenizer_for_seq,
            stop_checker=StopChecker(self.scheduler_config.max_model_len,
                                     get_tokenizer_for_seq),
        ))

    self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}

    # Flag to set when an input fails to process and the engine should run
    # the next step without re-scheduling.
    self._skip_scheduling_next_step = False

    # Don't keep the dummy data in memory
    self.reset_mm_cache()
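
In practice the engine is usually not constructed by calling `__init__` directly; as in the `step()` example above, it is built from `EngineArgs` via `from_engine_args`. A minimal sketch (the model name is a placeholder; any supported model works):

```
from vllm import EngineArgs, LLMEngine, SamplingParams

# Placeholder model; substitute any model supported by vLLM.
engine_args = EngineArgs(model="facebook/opt-125m")
engine = LLMEngine.from_engine_args(engine_args)

engine.add_request("0", "What is LLM?", SamplingParams(temperature=0.0))
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)
```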

__reduce__

__reduce__()
Source code in vllm/engine/llm_engine.py
def __reduce__(self):
    # This is to ensure that the LLMEngine is not referenced in
    # the closure used to initialize Ray worker actors
    raise RuntimeError("LLMEngine should not be pickled!")

_abort_and_cache_schedule

_abort_and_cache_schedule(
    request_id: str,
    virtual_engine: int,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    scheduler_outputs: SchedulerOutputs,
    allow_async_output_proc: bool,
) -> None

Aborts a single request, and caches the scheduler outputs minus that request. This allows the next step to continue processing the remaining requests without having to re-run the scheduler.

Source code in vllm/engine/llm_engine.py
def _abort_and_cache_schedule(
        self, request_id: str, virtual_engine: int,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        scheduler_outputs: SchedulerOutputs,
        allow_async_output_proc: bool) -> None:
    """Aborts a single request, and caches the scheduler outputs minus that
    request. This allows the next step to continue processing the remaining
    requests without having to re-run the scheduler."""

    # Abort the request and remove its sequence group from the current
    # schedule
    self.abort_request(request_id)
    for i, metadata in enumerate(seq_group_metadata_list):
        if metadata.request_id == request_id:
            del seq_group_metadata_list[i]
            break
    for i, group in enumerate(scheduler_outputs.scheduled_seq_groups):
        if group.seq_group.request_id == request_id:
            del scheduler_outputs.scheduled_seq_groups[i]
            break

    # If there are still other sequence groups left in the schedule, cache
    # them and flag the engine to reuse the schedule.
    if len(seq_group_metadata_list) > 0:
        self._skip_scheduling_next_step = True
        # Reuse multi-step caching logic
        self._cache_scheduler_outputs_for_multi_step(
            virtual_engine=virtual_engine,
            scheduler_outputs=scheduler_outputs,
            seq_group_metadata_list=seq_group_metadata_list,
            allow_async_output_proc=allow_async_output_proc)
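
From the caller's point of view, the exception raised for the failed request propagates out of `step()`, while the rest of the batch is cached for the next call. A usage sketch under that assumption (the failed request has already been aborted by the engine):

```
try:
    request_outputs = engine.step()
except Exception:
    # The remaining scheduled requests were cached above, so the next
    # step() call skips re-scheduling and runs them as-is.
    request_outputs = engine.step()
```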

_add_processed_request

_add_processed_request(
    request_id: str,
    processed_inputs: ProcessorInputs,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: float,
    lora_request: Optional[LoRARequest],
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
) -> Optional[SequenceGroup]

Add a processed request to the engine's request pool and return the created sequence group.

Source code in vllm/engine/llm_engine.py
def _add_processed_request(
    self,
    request_id: str,
    processed_inputs: ProcessorInputs,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: float,
    lora_request: Optional[LoRARequest],
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
) -> Optional[SequenceGroup]:
    """Add a processed request to the engine's request pool.
    return the created sequence group.
    """
    if isinstance(params, SamplingParams) and params.n > 1:
        ParallelSampleSequenceGroup.add_request(
            request_id,
            self,
            params,
            processed_inputs=processed_inputs,
            arrival_time=arrival_time,
            lora_request=lora_request,
            trace_headers=trace_headers,
            priority=priority,
        )
        return None

    self._validate_model_inputs(processed_inputs, lora_request)
    # Create the sequences.
    block_size = self.cache_config.block_size
    seq_id = next(self.seq_counter)
    eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)

    encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)

    seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id,
                   lora_request)

    encoder_seq = (None if encoder_inputs is None else Sequence(
        seq_id, encoder_inputs, block_size, eos_token_id, lora_request))

    # Create a SequenceGroup based on SamplingParams or PoolingParams
    if isinstance(params, SamplingParams):
        seq_group = self._create_sequence_group_with_sampling(
            request_id,
            seq,
            params,
            arrival_time=arrival_time,
            lora_request=lora_request,
            trace_headers=trace_headers,
            encoder_seq=encoder_seq,
            priority=priority)
    elif isinstance(params, PoolingParams):
        seq_group = self._create_sequence_group_with_pooling(
            request_id,
            seq,
            params,
            arrival_time=arrival_time,
            lora_request=lora_request,
            encoder_seq=encoder_seq,
            priority=priority)
    else:
        raise ValueError(
            "Either SamplingParams or PoolingParams must be provided.")

    # Add the sequence group to the scheduler with least unfinished seqs.
    costs = [
        scheduler.get_num_unfinished_seq_groups()
        for scheduler in self.scheduler
    ]
    min_cost_scheduler = self.scheduler[costs.index(min(costs))]
    min_cost_scheduler.add_seq_group(seq_group)

    return seq_group

_advance_to_next_step

_advance_to_next_step(
    output: SamplerOutput,
    seq_group_metadata_list: List[SequenceGroupMetadata],
    scheduled_seq_groups: List[ScheduledSequenceGroup],
) -> None

Given model output from a single run, append the tokens to the sequences. This is normally done inside output processor, but it is required if the worker is to perform async forward pass to next step.

Source code in vllm/engine/llm_engine.py
def _advance_to_next_step(
        self, output: SamplerOutput,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None:
    """Given model output from a single run, append the tokens to the
    sequences. This is normally done inside output processor, but it is
    required if the worker is to perform async forward pass to next step.
    """
    for seq_group_metadata, sequence_group_outputs, scheduled_seq_group in \
        zip(seq_group_metadata_list, output, scheduled_seq_groups):
        seq_group = scheduled_seq_group.seq_group

        if seq_group.is_finished():
            continue

        token_chunk_size = (seq_group_metadata.token_chunk_size
                            if seq_group_metadata.token_chunk_size
                            is not None else 0)
        seq_group.update_num_computed_tokens(token_chunk_size)

        if seq_group_metadata.do_sample:
            assert len(sequence_group_outputs.samples) == 1, (
                "Async output processor expects a single sample"
                " (i.e sampling_params.n == 1)")
            sample = sequence_group_outputs.samples[0]

            assert len(seq_group.seqs) == 1
            seq = seq_group.seqs[0]

            seq.append_token_id(sample.output_token, sample.logprobs,
                                sample.output_embed)

_build_logits_processors

_build_logits_processors(
    sampling_params: SamplingParams,
    lora_request: Optional[LoRARequest],
) -> SamplingParams

Constructs logits processors based on the logit_bias and allowed_token_ids fields in sampling_params. Deletes those fields and adds the constructed logits processors to the logits_processors field. Returns the modified sampling params.

Source code in vllm/engine/llm_engine.py
def _build_logits_processors(
        self, sampling_params: SamplingParams,
        lora_request: Optional[LoRARequest]) -> SamplingParams:
    """Constructs logits processors based on the logits_bias, and
    allowed_token_ids fields in sampling_params. Deletes those fields and
    adds the constructed logits processors to the logits_processors field.
    Returns the modified sampling params."""

    logits_processors = []

    if (sampling_params.logit_bias or sampling_params.allowed_token_ids):
        tokenizer = self.get_tokenizer(lora_request=lora_request)

        processors = get_openai_logits_processors(
            logit_bias=sampling_params.logit_bias,
            allowed_token_ids=sampling_params.allowed_token_ids,
            tokenizer=tokenizer)
        logits_processors.extend(processors)

        # Unset so these don't get passed down to the model
        sampling_params.logit_bias = None
        sampling_params.allowed_token_ids = None

    if len(sampling_params.bad_words) > 0:
        tokenizer = self.get_tokenizer(lora_request)
        processors = get_bad_words_logits_processors(
            bad_words=sampling_params.bad_words, tokenizer=tokenizer)
        logits_processors.extend(processors)

    if logits_processors:
        if sampling_params.logits_processors is None:
            sampling_params.logits_processors = logits_processors
        else:
            sampling_params.logits_processors.extend(logits_processors)

    return sampling_params
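
A minimal sketch (not part of the vLLM source) of the request-level fields this helper consumes; it assumes an `engine` already built via `LLMEngine.from_engine_args` and that the chosen token ids exist in the model vocabulary:

from vllm import SamplingParams

params = SamplingParams(
    temperature=0.0,
    logit_bias={1234: 5.0},          # bias the logit of token id 1234
    allowed_token_ids=[1234, 5678],  # restrict sampling to these ids
    bad_words=["forbidden"],         # converted into a bad-words processor
)
# During request creation the engine rewrites these fields into
# params.logits_processors and clears logit_bias/allowed_token_ids.
engine.add_request("req-0", "Hello, my name is", params)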

_cache_scheduler_outputs_for_multi_step

_cache_scheduler_outputs_for_multi_step(
    virtual_engine: int,
    seq_group_metadata_list: Optional[
        List[SequenceGroupMetadata]
    ],
    scheduler_outputs: SchedulerOutputs,
    allow_async_output_proc: bool,
) -> None
Source code in vllm/engine/llm_engine.py
def _cache_scheduler_outputs_for_multi_step(
        self, virtual_engine: int,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
        scheduler_outputs: SchedulerOutputs,
        allow_async_output_proc: bool) -> None:
    co = self.cached_scheduler_outputs[virtual_engine]

    co.seq_group_metadata_list = seq_group_metadata_list
    co.scheduler_outputs = scheduler_outputs
    co.allow_async_output_proc = allow_async_output_proc
    co.last_output = None

_create_sequence_group_with_pooling

_create_sequence_group_with_pooling(
    request_id: str,
    seq: Sequence,
    pooling_params: PoolingParams,
    arrival_time: float,
    lora_request: Optional[LoRARequest],
    encoder_seq: Optional[Sequence] = None,
    priority: int = 0,
) -> SequenceGroup

Creates a SequenceGroup with PoolingParams.

Source code in vllm/engine/llm_engine.py
def _create_sequence_group_with_pooling(
    self,
    request_id: str,
    seq: Sequence,
    pooling_params: PoolingParams,
    arrival_time: float,
    lora_request: Optional[LoRARequest],
    encoder_seq: Optional[Sequence] = None,
    priority: int = 0,
) -> SequenceGroup:
    """Creates a SequenceGroup with PoolingParams."""
    # Defensive copy of PoolingParams, which are used by the pooler
    pooling_params = pooling_params.clone()
    # Create the sequence group.
    seq_group = SequenceGroup(request_id=request_id,
                              seqs=[seq],
                              arrival_time=arrival_time,
                              lora_request=lora_request,
                              pooling_params=pooling_params,
                              encoder_seq=encoder_seq,
                              priority=priority)
    return seq_group
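
A rough illustration of the request path that ends in this helper, assuming an `engine` that was built for a pooling/embedding model:

from vllm import PoolingParams

# Pooling requests pass PoolingParams instead of SamplingParams; the engine
# then builds the SequenceGroup via _create_sequence_group_with_pooling.
engine.add_request("embed-0", "vLLM is a fast inference engine.", PoolingParams())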

_create_sequence_group_with_sampling

_create_sequence_group_with_sampling(
    request_id: str,
    seq: Sequence,
    sampling_params: SamplingParams,
    arrival_time: float,
    lora_request: Optional[LoRARequest],
    trace_headers: Optional[Mapping[str, str]] = None,
    encoder_seq: Optional[Sequence] = None,
    priority: int = 0,
) -> SequenceGroup

Creates a SequenceGroup with SamplingParams.

Source code in vllm/engine/llm_engine.py
def _create_sequence_group_with_sampling(
    self,
    request_id: str,
    seq: Sequence,
    sampling_params: SamplingParams,
    arrival_time: float,
    lora_request: Optional[LoRARequest],
    trace_headers: Optional[Mapping[str, str]] = None,
    encoder_seq: Optional[Sequence] = None,
    priority: int = 0,
) -> SequenceGroup:
    """Creates a SequenceGroup with SamplingParams."""
    max_logprobs = self.get_model_config().max_logprobs
    if (sampling_params.logprobs
            and sampling_params.logprobs > max_logprobs) or (
                sampling_params.prompt_logprobs
                and sampling_params.prompt_logprobs > max_logprobs):
        raise ValueError(f"Cannot request more than "
                         f"{max_logprobs} logprobs.")

    sampling_params = self._build_logits_processors(
        sampling_params, lora_request)

    # Defensive copy of SamplingParams, which are used by the sampler,
    # this doesn't deep-copy LogitsProcessor objects
    sampling_params = sampling_params.clone()

    sampling_params.update_from_generation_config(
        self.generation_config_fields, seq.eos_token_id)

    # Create the sequence group.
    draft_size = 1
    if self.vllm_config.speculative_config is not None:
        draft_size = \
            self.vllm_config.speculative_config.num_speculative_tokens + 1
    seq_group = SequenceGroup(request_id=request_id,
                              seqs=[seq],
                              arrival_time=arrival_time,
                              sampling_params=sampling_params,
                              lora_request=lora_request,
                              trace_headers=trace_headers,
                              encoder_seq=encoder_seq,
                              priority=priority,
                              draft_size=draft_size)

    return seq_group
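
A small sketch (not from the source) of the max_logprobs guard enforced here, assuming an `engine` instance is available:

from vllm import SamplingParams

max_logprobs = engine.get_model_config().max_logprobs

# Asking for more logprobs than the model config allows is rejected while
# the SequenceGroup is being created.
try:
    engine.add_request("req-1", "Hello", SamplingParams(logprobs=max_logprobs + 1))
except ValueError as err:
    print(err)  # Cannot request more than <max_logprobs> logprobs.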

_get_executor_cls classmethod

_get_executor_cls(
    engine_config: VllmConfig,
) -> Type[ExecutorBase]
Source code in vllm/engine/llm_engine.py
@classmethod
def _get_executor_cls(cls,
                      engine_config: VllmConfig) -> Type[ExecutorBase]:
    # distributed_executor_backend must be set in VllmConfig.__post_init__
    distributed_executor_backend = (
        engine_config.parallel_config.distributed_executor_backend)
    # Initialize the cluster and specify the executor class.
    if isinstance(distributed_executor_backend, type):
        if not issubclass(distributed_executor_backend, ExecutorBase):
            raise TypeError(
                "distributed_executor_backend must be a subclass of "
                f"ExecutorBase. Got {distributed_executor_backend}.")
        executor_class = distributed_executor_backend
    elif distributed_executor_backend == "ray":
        from vllm.executor.ray_distributed_executor import (
            RayDistributedExecutor)
        executor_class = RayDistributedExecutor
    elif distributed_executor_backend == "mp":
        from vllm.executor.mp_distributed_executor import (
            MultiprocessingDistributedExecutor)
        assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
            "multiprocessing distributed executor backend does not "
            "support VLLM_USE_RAY_SPMD_WORKER=1")
        executor_class = MultiprocessingDistributedExecutor
    elif distributed_executor_backend == "uni":
        # JAX-style, single-process, multi-device executor.
        from vllm.executor.uniproc_executor import UniProcExecutor
        executor_class = UniProcExecutor
    elif distributed_executor_backend == "external_launcher":
        # executor with external launcher
        from vllm.executor.uniproc_executor import (  # noqa
            ExecutorWithExternalLauncher)
        executor_class = ExecutorWithExternalLauncher
    else:
        raise ValueError("unrecognized distributed_executor_backend: "
                         f"{distributed_executor_backend}")
    return executor_class
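
A hedged sketch of selecting one of the recognized backends; it assumes `EngineArgs` forwards `distributed_executor_backend` into the parallel config (the string values match the branches above):

from vllm import EngineArgs, LLMEngine

engine_args = EngineArgs(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    distributed_executor_backend="mp",  # or "ray", "uni", "external_launcher"
)
engine = LLMEngine.from_engine_args(engine_args)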

_get_last_sampled_token_ids

_get_last_sampled_token_ids(
    virtual_engine: int,
) -> Optional[Tensor]
Source code in vllm/engine/llm_engine.py
def _get_last_sampled_token_ids(
        self, virtual_engine: int) -> Optional[torch.Tensor]:
    return None

_get_stats

_get_stats(
    scheduler_outputs: Optional[SchedulerOutputs],
    model_output: Optional[List[SamplerOutput]] = None,
    finished_before: Optional[List[int]] = None,
    skip: Optional[List[int]] = None,
) -> Stats

Get Stats to be Logged to Prometheus.

Parameters:

Name Type Description Default
scheduler_outputs Optional[SchedulerOutputs]

Optional, used to populate metrics related to the scheduled batch.

required
model_output Optional[List[SamplerOutput]]

Optional, used to emit speculative decoding metrics which are created by the workers.

None
finished_before Optional[List[int]]

Optional, indices of sequences that were finished before. These sequences will be ignored.

None
skip Optional[List[int]]

Optional, indices of sequences that were preempted. These sequences will be ignored.

None
Source code in vllm/engine/llm_engine.py
def _get_stats(self,
               scheduler_outputs: Optional[SchedulerOutputs],
               model_output: Optional[List[SamplerOutput]] = None,
               finished_before: Optional[List[int]] = None,
               skip: Optional[List[int]] = None) -> Stats:
    """Get Stats to be Logged to Prometheus.

    Args:
        scheduler_outputs: Optional, used to populate metrics related to
            the scheduled batch,
        model_output: Optional, used to emit speculative decoding metrics
            which are created by the workers.
        finished_before: Optional, indices of sequences that were finished
            before. These sequences will be ignored.
        skip: Optional, indices of sequences that were preempted. These
            sequences will be ignored.
    """
    now = time.time()

    # System State
    #   Scheduler State
    num_running_sys = sum(
        len(scheduler.running) for scheduler in self.scheduler)
    num_swapped_sys = sum(
        len(scheduler.swapped) for scheduler in self.scheduler)
    num_waiting_sys = sum(
        len(scheduler.waiting) for scheduler in self.scheduler)

    # KV Cache Usage in %
    num_total_gpu = self.cache_config.num_gpu_blocks
    gpu_cache_usage_sys = 0.
    if num_total_gpu:  # Guard against both None and 0
        num_free_gpu = sum(
            scheduler.block_manager.get_num_free_gpu_blocks()
            for scheduler in self.scheduler)
        gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)

    num_total_cpu = self.cache_config.num_cpu_blocks
    cpu_cache_usage_sys = 0.
    if num_total_cpu:  # Guard against both None and 0
        num_free_cpu = sum(
            scheduler.block_manager.get_num_free_cpu_blocks()
            for scheduler in self.scheduler)
        cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu)

    # Prefix Cache Hit Rate. Note that we always use
    # the cache hit rate of the first virtual engine.
    cpu_prefix_cache_hit_rate = self.scheduler[
        0].get_prefix_cache_hit_rate(Device.CPU)
    gpu_prefix_cache_hit_rate = self.scheduler[
        0].get_prefix_cache_hit_rate(Device.GPU)

    # Exchange the usage and cache hit stats between gpu and cpu when
    # running on cpu because the cpu_worker.py intentionally reports the
    # number of cpu blocks as gpu blocks in favor of cache management.
    if self.device_config.device_type == "cpu":
        num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu
        gpu_cache_usage_sys, cpu_cache_usage_sys = (
            cpu_cache_usage_sys,
            gpu_cache_usage_sys,
        )
        gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = (
            cpu_prefix_cache_hit_rate,
            gpu_prefix_cache_hit_rate,
        )

    # Iteration stats
    num_prompt_tokens_iter = 0
    num_generation_tokens_iter = 0
    num_tokens_iter = 0
    time_to_first_tokens_iter: List[float] = []
    time_per_output_tokens_iter: List[float] = []
    num_preemption_iter = (0 if scheduler_outputs is None else
                           scheduler_outputs.preempted)

    # Request stats
    #   Latency
    time_e2e_requests: List[float] = []
    time_queue_requests: List[float] = []
    time_inference_requests: List[float] = []
    time_prefill_requests: List[float] = []
    time_decode_requests: List[float] = []
    #   Metadata
    num_prompt_tokens_requests: List[int] = []
    num_generation_tokens_requests: List[int] = []
    n_requests: List[int] = []
    max_num_generation_tokens_requests: List[int] = []
    max_tokens_requests: List[int] = []
    finished_reason_requests: List[str] = []

    # LoRA requests
    running_lora_adapters = dict(
        collectionsCounter([
            running_request.lora_request.lora_name
            for scheduler in self.scheduler
            for running_request in scheduler.running
            if running_request.lora_request
        ]))
    waiting_lora_adapters = dict(
        collectionsCounter([
            waiting_request.lora_request.lora_name
            for scheduler in self.scheduler
            for waiting_request in scheduler.waiting
            if waiting_request.lora_request
        ]))
    max_lora_stat = "0"
    if self.lora_config:
        max_lora_stat = str(self.lora_config.max_loras)

    # NOTE: This loop assumes prefill seq_groups are before
    # decode seq_groups in scheduled_seq_groups.
    if scheduler_outputs is not None:
        # For async postprocessor, already finished sequences need to be
        # not counted (to avoid double counting)
        actual_num_batched_tokens = scheduler_outputs.num_batched_tokens  # type: ignore

        num_generation_tokens_from_prefill_groups = 0
        # NOTE: if scheduler_outputs.num_prefill_groups > 0 and
        # the len of scheduler_outputs.scheduled_seq_groups is !=
        # scheduler_outputs.num_prefill_groups, this means that
        # chunked prefills have been detected.

        for idx, scheduled_seq_group in enumerate(
                scheduler_outputs.scheduled_seq_groups):
            # Skip double logging when using async output proc
            if finished_before and idx in finished_before:
                actual_num_batched_tokens -= 1
                continue

            # Currently, skip == preempted sequences, so we need to skip
            # their log stats
            if skip and idx in skip:
                continue

            group_was_prefill = idx < scheduler_outputs.num_prefill_groups
            seq_group = scheduled_seq_group.seq_group

            # NOTE: a seq_group that completed all of its prefill tokens
            # in the last iteration will have seq_group.is_prefill() = False
            # with group_was_prefill = True
            if group_was_prefill:
                # Number of prompt tokens.
                num_prompt_tokens_iter += (
                    scheduled_seq_group.token_chunk_size)

                # If the seq_group just finished the prefill state
                # get TTFT.
                if not seq_group.is_prefill():
                    latency = seq_group.get_last_token_latency()
                    time_to_first_tokens_iter.append(latency)

                    # One generation token per finished prefill.
                    num_generation_tokens_from_prefill_groups += (
                        seq_group.num_seqs())
            else:
                # TPOTs.
                latency = seq_group.get_last_token_latency()
                time_per_output_tokens_iter.append(latency)
                if seq_group.state.current_step == 0:
                    # For async_output_proc, the do_log_stats()
                    # is called following init_multi_step(), which
                    # sets the current_step to zero.
                    actual_num_batched_tokens +=\
                        seq_group.state.num_steps - 1
                else:
                    actual_num_batched_tokens +=\
                        seq_group.state.current_step - 1

            # Because of chunked prefill, we can have a single sequence
            # group that does multiple prompt_runs. To prevent logging
            # the same metadata more than once per request, we standardize
            # on logging request level information for finished requests,
            # which can only happen once.
            if seq_group.is_finished():
                # Latency timings
                time_e2e_requests.append(now -
                                         seq_group.metrics.arrival_time)
                if (seq_group.metrics.first_scheduled_time is not None and
                        seq_group.metrics.first_token_time is not None):
                    time_queue_requests.append(
                        seq_group.metrics.first_scheduled_time -
                        seq_group.metrics.arrival_time)
                    time_prefill_requests.append(
                        seq_group.metrics.first_token_time -
                        seq_group.metrics.first_scheduled_time)
                    time_decode_requests.append(
                        now - seq_group.metrics.first_token_time)
                    time_inference_requests.append(
                        now - seq_group.metrics.first_scheduled_time)
                # Metadata
                num_prompt_tokens_requests.append(
                    len(seq_group.prompt_token_ids))
                num_generation_tokens_requests.extend([
                    seq.get_output_len()
                    for seq in seq_group.get_finished_seqs()
                ])
                max_num_generation_tokens_requests.append(
                    max(seq.get_output_len()
                        for seq in seq_group.get_seqs()))
                if seq_group.sampling_params is not None:
                    n_requests.append(seq_group.sampling_params.n)
                    max_tokens_requests.append(
                        seq_group.sampling_params.max_tokens)
                finished_reason_requests.extend([
                    SequenceStatus.get_finished_reason(seq.status)
                    for seq in seq_group.get_finished_seqs()
                ])

        # Number of generation tokens.
        #   num_batched_tokens equals the number of prompt_tokens plus the
        #   number of decode_tokens in a single iteration. So,
        #   num_generation_tokens = num_batched_tokens - num_prompt_tokens
        #   + num_generation_tokens_from_prefill_groups (since we generate
        #   one token on prefills on iters where the prefill finishes).
        num_generation_tokens_iter = (
            actual_num_batched_tokens - num_prompt_tokens_iter +
            num_generation_tokens_from_prefill_groups)
        num_tokens_iter = (num_generation_tokens_iter +
                           num_prompt_tokens_iter)

    return Stats(
        now=now,
        # System stats
        #   Scheduler State
        num_running_sys=num_running_sys,
        num_swapped_sys=num_swapped_sys,
        num_waiting_sys=num_waiting_sys,
        #   KV Cache Usage in %
        gpu_cache_usage_sys=gpu_cache_usage_sys,
        cpu_cache_usage_sys=cpu_cache_usage_sys,
        #   Prefix Cache Hit Rate
        cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate,
        gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate,

        # Iteration stats
        num_prompt_tokens_iter=num_prompt_tokens_iter,
        num_generation_tokens_iter=num_generation_tokens_iter,
        num_tokens_iter=num_tokens_iter,
        time_to_first_tokens_iter=time_to_first_tokens_iter,
        time_per_output_tokens_iter=time_per_output_tokens_iter,
        num_preemption_iter=num_preemption_iter,

        # Request stats
        #   Latency
        time_e2e_requests=time_e2e_requests,
        time_queue_requests=time_queue_requests,
        time_inference_requests=time_inference_requests,
        time_prefill_requests=time_prefill_requests,
        time_decode_requests=time_decode_requests,
        #   Metadata
        num_prompt_tokens_requests=num_prompt_tokens_requests,
        num_generation_tokens_requests=num_generation_tokens_requests,
        max_num_generation_tokens_requests=
        max_num_generation_tokens_requests,
        n_requests=n_requests,
        max_tokens_requests=max_tokens_requests,
        finished_reason_requests=finished_reason_requests,
        max_lora=str(max_lora_stat),
        waiting_lora_adapters=list(waiting_lora_adapters.keys()),
        running_lora_adapters=list(running_lora_adapters.keys()))
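
A worked example of the KV-cache usage arithmetic used above:

num_total_gpu = 8000        # cache_config.num_gpu_blocks
num_free_gpu = 2000         # summed over the schedulers' block managers
gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu)
print(gpu_cache_usage_sys)  # 0.75 -> 75% of the GPU KV cache is in use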

_has_remaining_steps

_has_remaining_steps(
    seq_group_metadata_list: Optional[
        List[SequenceGroupMetadata]
    ],
) -> bool
Source code in vllm/engine/llm_engine.py
def _has_remaining_steps(
    self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]]
) -> bool:
    return False

_init_tokenizer

_init_tokenizer() -> TokenizerGroup
Source code in vllm/engine/llm_engine.py
def _init_tokenizer(self) -> TokenizerGroup:
    return init_tokenizer_from_configs(
        model_config=self.model_config,
        scheduler_config=self.scheduler_config,
        lora_config=self.lora_config)

_initialize_kv_caches

_initialize_kv_caches() -> None

Initialize the KV cache in the worker(s).

The workers will determine the number of blocks in both the GPU cache and the swap CPU cache.

Source code in vllm/engine/llm_engine.py
def _initialize_kv_caches(self) -> None:
    """Initialize the KV cache in the worker(s).

    The workers will determine the number of blocks in both the GPU cache
    and the swap CPU cache.
    """
    start = time.time()
    num_gpu_blocks, num_cpu_blocks = (
        self.model_executor.determine_num_available_blocks())

    if self.cache_config.num_gpu_blocks_override is not None:
        num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
        logger.info(
            "Overriding num_gpu_blocks=%d with "
            "num_gpu_blocks_override=%d", num_gpu_blocks,
            num_gpu_blocks_override)
        num_gpu_blocks = num_gpu_blocks_override

    self.cache_config.num_gpu_blocks = num_gpu_blocks
    self.cache_config.num_cpu_blocks = num_cpu_blocks

    self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
    elapsed = time.time() - start
    logger.info(("init engine (profile, create kv cache, "
                 "warmup model) took %.2f seconds"), elapsed)

_process_model_outputs

_process_model_outputs(
    ctx: SchedulerContext, request_id: Optional[str] = None
) -> None

Apply the model output to the sequences in the scheduled seq groups and return responses.

ctx: The virtual engine context to work on
request_id: If provided, then only this request is going to be processed

Source code in vllm/engine/llm_engine.py
def _process_model_outputs(self,
                           ctx: SchedulerContext,
                           request_id: Optional[str] = None) -> None:
    """Apply the model output to the sequences in the scheduled seq groups
    and return responses.

    ctx: The virtual engine context to work on
    request_id: If provided, then only this request is going to be processed
    """

    now = time.time()

    if len(ctx.output_queue) == 0:
        return None

    # Get pending async postprocessor
    if request_id:
        # When we process only one request, no pop is required
        # (since later we will process all of the rest)
        (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
         is_last_step, is_first_step_output, skip) = ctx.output_queue[0]
    else:
        (outputs, seq_group_metadata_list, scheduler_outputs, is_async,
         is_last_step, is_first_step_output,
         skip) = ctx.output_queue.popleft()

    # Sanity check
    assert len(seq_group_metadata_list) == len(
        scheduler_outputs.scheduled_seq_groups)

    has_multiple_outputs: bool = len(outputs) > 1
    outputs_by_sequence_group: List[List[SequenceGroupOutput]]
    assert not has_multiple_outputs
    outputs_by_sequence_group = outputs

    # Determine the requests we need to operate on
    if request_id:
        indices = []
        for i, seq_group_meta in enumerate(seq_group_metadata_list):
            if seq_group_meta.request_id == request_id:
                assert i not in skip  # Cannot be called twice
                indices.append(i)
                break

        # If the request_id was not found, then it means that
        # this is a new request that has no pending async
        # postprocessor
        if not indices:
            return
    else:
        indices = range(len(seq_group_metadata_list))  # type: ignore

    finished_before: List[int] = []
    finished_now: List[int] = []
    for i in indices:
        if i in skip:
            continue

        seq_group_meta = seq_group_metadata_list[i]
        scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]

        seq_group: SequenceGroup = scheduled_seq_group.seq_group

        if seq_group.is_finished():
            finished_before.append(i)
            continue

        output: List[SequenceGroupOutput]
        if has_multiple_outputs:
            output = outputs_by_sequence_group[i]
        else:
            output = [outputs_by_sequence_group[0][i]]

        if not is_async:
            seq_group.update_num_computed_tokens(
                seq_group_meta.token_chunk_size or 0)

        if outputs:
            for o in outputs:
                if (isinstance(o, SamplerOutput)
                        and seq_group.metrics is not None):
                    if seq_group.metrics.model_forward_time is not None:
                        seq_group.metrics.model_forward_time += (
                            o.model_forward_time or 0)
                    else:
                        seq_group.metrics.model_forward_time = (
                            o.model_forward_time)
                    if seq_group.metrics.model_execute_time is not None:
                        seq_group.metrics.model_execute_time += (
                            o.model_execute_time or 0)
                    else:
                        seq_group.metrics.model_execute_time = (
                            o.model_execute_time)

        if self.model_config.runner_type == "pooling":
            self._process_sequence_group_outputs(seq_group, output)
        else:
            self.output_processor.process_prompt_logprob(seq_group, output)
            if seq_group_meta.do_sample:
                self.output_processor.process_outputs(
                    seq_group, output, is_async)

        if seq_group.is_finished():
            finished_now.append(i)

    # Generate outputs for the requests that finished this iteration
    for i in finished_now:
        scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]

        seq_group = scheduled_seq_group.seq_group
        seq_group.maybe_set_first_token_time(now)
        if not seq_group.is_prefill():
            seq_group.set_last_token_time(now)
        request_output = RequestOutputFactory.create(
            seq_group,
            self.seq_id_to_seq_group,
            use_cache=self.use_cached_outputs)
        if request_output:
            ctx.request_outputs.append(request_output)

    # When we process a single request, we skip it for the next time,
    # and invoke the request output callback (if there was final output)
    if request_id:
        assert len(indices) == 1
        skip.append(indices[0])

        if (finished_now
                and self.process_request_outputs_callback is not None):
            self.process_request_outputs_callback(ctx.request_outputs)
            ctx.request_outputs.clear()
        return

    # Free currently finished requests
    if finished_now:
        for scheduler in self.scheduler:
            scheduler.free_finished_seq_groups()

    # Create the outputs
    for i in indices:
        if i in skip or i in finished_before or i in finished_now:
            continue  # Avoids double processing

        scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i]

        seq_group = scheduled_seq_group.seq_group
        seq_group.maybe_set_first_token_time(now)
        if not seq_group.is_prefill():
            seq_group.set_last_token_time(now)
        request_output = RequestOutputFactory.create(
            seq_group,
            self.seq_id_to_seq_group,
            use_cache=self.use_cached_outputs)
        if request_output:
            ctx.request_outputs.append(request_output)

    # Create outputs only after processing the scheduler's results

    for seq_group in scheduler_outputs.ignored_seq_groups:
        params = seq_group.sampling_params
        if params is not None and params.output_kind == (
                RequestOutputKind.DELTA) and not seq_group.is_finished():
            continue

        request_output = RequestOutputFactory.create(
            seq_group,
            self.seq_id_to_seq_group,
            use_cache=self.use_cached_outputs,
        )
        if request_output:
            ctx.request_outputs.append(request_output)

    # Immediately process request outputs here (if callback is given)
    if (ctx.request_outputs
            and self.process_request_outputs_callback is not None):
        self.process_request_outputs_callback(ctx.request_outputs)
        ctx.request_outputs.clear()

    # For async case, we need to record the stats here.
    # For non-async case, the stats are done in the
    # LLMEngine/AsyncLLMEngine directly
    if is_async:
        # Log stats.
        self.do_log_stats(scheduler_outputs, outputs, finished_before,
                          skip)

        # Tracing
        self.do_tracing(scheduler_outputs, finished_before)

    return None

_process_sequence_group_outputs staticmethod

_process_sequence_group_outputs(
    seq_group: SequenceGroup,
    outputs: List[PoolingSequenceGroupOutput],
) -> None
Source code in vllm/engine/llm_engine.py
@staticmethod
def _process_sequence_group_outputs(
    seq_group: SequenceGroup,
    outputs: List[PoolingSequenceGroupOutput],
) -> None:
    seq_group.pooled_data = outputs[0].data

    for seq in seq_group.get_seqs():
        seq.status = SequenceStatus.FINISHED_STOPPED

    return

_update_cached_scheduler_output

_update_cached_scheduler_output(
    virtual_engine: int,
    output: List[Optional[SamplerOutput]],
) -> None
Source code in vllm/engine/llm_engine.py
def _update_cached_scheduler_output(
        self, virtual_engine: int,
        output: List[Optional[SamplerOutput]]) -> None:
    if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0
            and output[0] is not None):
        last_output = output[-1]
        assert last_output is not None
        assert last_output.sampled_token_ids_cpu is not None
        assert last_output.sampled_token_ids is None
        assert last_output.sampled_token_probs is None
        self.cached_scheduler_outputs[
            virtual_engine].last_output = last_output

_validate_model_input

_validate_model_input(
    prompt_inputs: SingletonInputs,
    lora_request: Optional[LoRARequest],
    *,
    prompt_type: Literal["encoder", "decoder"],
)
Source code in vllm/engine/llm_engine.py
def _validate_model_input(
    self,
    prompt_inputs: SingletonInputs,
    lora_request: Optional[LoRARequest],
    *,
    prompt_type: Literal["encoder", "decoder"],
):
    model_config = self.model_config
    tokenizer = (None if self.tokenizer is None else
                 self.tokenizer.get_lora_tokenizer(lora_request))

    prompt_ids = prompt_inputs.get("prompt_token_ids", [])
    if not prompt_ids:
        if prompt_type == "encoder" and model_config.is_multimodal_model:
            pass  # Mllama may have empty encoder inputs for text-only data
        elif prompt_inputs["type"] == "embeds":
            pass
        else:
            raise ValueError(f"The {prompt_type} prompt cannot be empty")

    if tokenizer is not None:
        max_input_id = max(prompt_ids, default=0)
        if max_input_id > tokenizer.max_token_id:
            raise ValueError(
                f"Token id {max_input_id} is out of vocabulary")

    max_prompt_len = self.model_config.max_model_len
    if len(prompt_ids) > max_prompt_len:
        if prompt_type == "encoder" and model_config.is_multimodal_model:
            mm_registry = self.input_preprocessor.mm_registry
            mm_processor = mm_registry.create_processor(
                model_config,
                tokenizer=tokenizer or object(),  # Dummy if no tokenizer
            )
            assert isinstance(mm_processor, EncDecMultiModalProcessor)

            if mm_processor.pad_dummy_encoder_prompt:
                return  # Skip encoder length check for Whisper and Donut

        if model_config.is_multimodal_model:
            suggestion = (
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens plus multimodal tokens. For image "
                "inputs, the number of image tokens depends on the number "
                "of images, and possibly their aspect ratios as well.")
        else:
            suggestion = (
                "Make sure that `max_model_len` is no smaller than the "
                "number of text tokens.")

        raise ValueError(
            f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
            f"longer than the maximum model length of {max_prompt_len}. "
            f"{suggestion}")

_validate_model_inputs

_validate_model_inputs(
    inputs: ProcessorInputs,
    lora_request: Optional[LoRARequest],
)
Source code in vllm/engine/llm_engine.py
def _validate_model_inputs(self, inputs: ProcessorInputs,
                           lora_request: Optional[LoRARequest]):
    encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs)

    if encoder_inputs is not None:
        self._validate_model_input(encoder_inputs,
                                   lora_request,
                                   prompt_type="encoder")

    self._validate_model_input(decoder_inputs,
                               lora_request,
                               prompt_type="decoder")

_verify_args

_verify_args() -> None
Source code in vllm/engine/llm_engine.py
def _verify_args(self) -> None:
    self.model_config.verify_with_parallel_config(self.parallel_config)
    self.cache_config.verify_with_parallel_config(self.parallel_config)
    if self.lora_config:
        self.lora_config.verify_with_model_config(self.model_config)
        self.lora_config.verify_with_scheduler_config(
            self.scheduler_config)

abort_request

abort_request(
    request_id: Union[str, Iterable[str]],
) -> None

Aborts a request(s) with the given ID.

Parameters:

Name Type Description Default
request_id Union[str, Iterable[str]]

The ID(s) of the request to abort.

required
Details
  • Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].
Example

initialize engine and add a request with request_id

request_id = str(0)

abort the request

engine.abort_request(request_id)

Source code in vllm/engine/llm_engine.py
def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
    """Aborts a request(s) with the given ID.

    Args:
        request_id: The ID(s) of the request to abort.

    Details:
        - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].

    Example:
        >>> # initialize engine and add a request with request_id
        >>> request_id = str(0)
        >>> # abort the request
        >>> engine.abort_request(request_id)
    """
    for scheduler in self.scheduler:
        scheduler.abort_seq_group(
            request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)
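
As the signature indicates, a single id or any iterable of ids can be aborted in one call (assuming an `engine` instance):

engine.abort_request("req-0")
engine.abort_request(["req-1", "req-2", "req-3"])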

add_logger

add_logger(
    logger_name: str, logger: StatLoggerBase
) -> None
Source code in vllm/engine/llm_engine.py
def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
    if not self.log_stats:
        raise RuntimeError(
            "Stat logging is disabled. Set `disable_log_stats=False` "
            "argument to enable.")
    if logger_name in self.stat_loggers:
        raise KeyError(f"Logger with name {logger_name} already exists.")
    self.stat_loggers[logger_name] = logger
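
A minimal sketch of plugging in a custom logger; `do_log_stats` only invokes `logger.log(stats)`, so a duck-typed object suffices for illustration, but a real implementation should subclass `StatLoggerBase` as the annotation indicates:

class PrintingStatLogger:
    """Toy logger: prints one line per stats snapshot."""

    def log(self, stats) -> None:
        print(f"running={stats.num_running_sys} waiting={stats.num_waiting_sys}")

engine.add_logger("printer", PrintingStatLogger())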

add_lora

add_lora(lora_request: LoRARequest) -> bool
Source code in vllm/engine/llm_engine.py
def add_lora(self, lora_request: LoRARequest) -> bool:
    return self.model_executor.add_lora(lora_request)
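
A hedged usage sketch; the positional fields `(lora_name, lora_int_id, lora_path)` follow common vLLM examples and should be treated as an assumption, as should the adapter path:

from vllm.lora.request import LoRARequest

lora = LoRARequest("sql_adapter", 1, "/path/to/sql_lora_adapter")
assert engine.add_lora(lora)
print(engine.list_loras())  # {1}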

add_request

add_request(
    request_id: str,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: Optional[float] = None,
    lora_request: Optional[LoRARequest] = None,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
) -> None

Add a request to the engine's request pool.

The request is added to the request pool and will be processed by the scheduler as engine.step() is called. The exact scheduling policy is determined by the scheduler.

Parameters:

Name Type Description Default
request_id str

The unique ID of the request.

required
prompt PromptType

The prompt to the LLM. See PromptType for more details about the format of each input.

required
params Union[SamplingParams, PoolingParams]

Parameters for sampling or pooling. SamplingParams for text generation. PoolingParams for pooling.

required
arrival_time Optional[float]

The arrival time of the request. If None, we use the current monotonic time.

None
lora_request Optional[LoRARequest]

The LoRA request to add.

None
trace_headers Optional[Mapping[str, str]]

OpenTelemetry trace headers.

None
priority int

The priority of the request. Only applicable with priority scheduling.

0
Details
  • Set arrival_time to the current time if it is None.
  • Set prompt_token_ids to the encoded prompt if it is None.
  • Create n number of [Sequence][vllm.Sequence] objects.
  • Create a [SequenceGroup][vllm.SequenceGroup] object from the list of [Sequence][vllm.Sequence].
  • Add the [SequenceGroup][vllm.SequenceGroup] object to the scheduler.
Example

initialize engine

engine = LLMEngine.from_engine_args(engine_args)

set request arguments

example_prompt = "Who is the president of the United States?" sampling_params = SamplingParams(temperature=0.0) request_id = 0

add the request to the engine

engine.add_request( str(request_id), example_prompt, SamplingParams(temperature=0.0))

continue the request processing

...

Source code in vllm/engine/llm_engine.py
def add_request(
    self,
    request_id: str,
    prompt: PromptType,
    params: Union[SamplingParams, PoolingParams],
    arrival_time: Optional[float] = None,
    lora_request: Optional[LoRARequest] = None,
    tokenization_kwargs: Optional[dict[str, Any]] = None,
    trace_headers: Optional[Mapping[str, str]] = None,
    priority: int = 0,
) -> None:
    """Add a request to the engine's request pool.

    The request is added to the request pool and will be processed by the
    scheduler as `engine.step()` is called. The exact scheduling policy is
    determined by the scheduler.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt to the LLM. See
            [PromptType][vllm.inputs.PromptType]
            for more details about the format of each input.
        params: Parameters for sampling or pooling.
            [SamplingParams][vllm.SamplingParams] for text generation.
            [PoolingParams][vllm.PoolingParams] for pooling.
        arrival_time: The arrival time of the request. If None, we use
            the current monotonic time.
        lora_request: The LoRA request to add.
        trace_headers: OpenTelemetry trace headers.
        priority: The priority of the request.
            Only applicable with priority scheduling.

    Details:
        - Set arrival_time to the current time if it is None.
        - Set prompt_token_ids to the encoded prompt if it is None.
        - Create `n` number of [Sequence][vllm.Sequence] objects.
        - Create a [SequenceGroup][vllm.SequenceGroup] object
          from the list of [Sequence][vllm.Sequence].
        - Add the [SequenceGroup][vllm.SequenceGroup] object to the
          scheduler.

    Example:
        >>> # initialize engine
        >>> engine = LLMEngine.from_engine_args(engine_args)
        >>> # set request arguments
        >>> example_prompt = "Who is the president of the United States?"
        >>> sampling_params = SamplingParams(temperature=0.0)
        >>> request_id = 0
        >>>
        >>> # add the request to the engine
        >>> engine.add_request(
        >>>    str(request_id),
        >>>    example_prompt,
        >>>    SamplingParams(temperature=0.0))
        >>> # continue the request processing
        >>> ...
    """
    if not isinstance(request_id, str):
        raise TypeError(
            f"request_id must be a string, got {type(request_id)}")

    if lora_request is not None and not self.lora_config:
        raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                         "not enabled!")

    if priority != 0 and not self.scheduler_config.policy == "priority":
        raise ValueError(f"Got priority {priority} but "
                         "Priority scheduling is not enabled.")

    if isinstance(params, SamplingParams) \
        and params.logits_processors:
        raise ValueError(
            "Logits processors are not supported in multi-step decoding")

    if arrival_time is None:
        arrival_time = time.time()

    if (isinstance(prompt, dict)
            and prompt.get("prompt_embeds", None) is not None
            and not prompt.get("prompt_token_ids", None)):
        seq_len = prompt["prompt_embeds"].shape[0]
        prompt["prompt_token_ids"] = [0] * seq_len

    processed_inputs = self.input_preprocessor.preprocess(
        prompt,
        tokenization_kwargs=tokenization_kwargs,
        lora_request=lora_request,
    )

    self._add_processed_request(
        request_id=request_id,
        processed_inputs=processed_inputs,
        params=params,
        arrival_time=arrival_time,
        lora_request=lora_request,
        trace_headers=trace_headers,
        priority=priority,
    )
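
Besides plain strings, PromptType also covers pre-tokenized input. A short sketch using `TokensPrompt` with arbitrary token ids, assuming an `engine` instance:

from vllm import SamplingParams, TokensPrompt

prompt = TokensPrompt(prompt_token_ids=[1, 2212, 3982, 16])
engine.add_request("req-tok-0", prompt, SamplingParams(max_tokens=16))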

check_health

check_health() -> None
Source code in vllm/engine/llm_engine.py
def check_health(self) -> None:
    self.model_executor.check_health()

collective_rpc

collective_rpc(
    method: Union[str, Callable[..., _R]],
    timeout: Optional[float] = None,
    args: tuple = (),
    kwargs: Optional[dict[str, Any]] = None,
) -> list[_R]
Source code in vllm/engine/llm_engine.py
def collective_rpc(self,
                   method: Union[str, Callable[..., _R]],
                   timeout: Optional[float] = None,
                   args: tuple = (),
                   kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
    return self.model_executor.collective_rpc(method, timeout, args,
                                              kwargs)
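
A hedged sketch of fanning a call out to every worker. With a callable, the worker instance is assumed to be passed as the first argument; check the executor's `collective_rpc` contract for your version:

def describe_worker(worker):
    # hypothetical helper, runs on each worker process
    return type(worker).__name__

print(engine.collective_rpc(describe_worker))
# The string form names an existing worker method instead, e.g.
# engine.collective_rpc("some_worker_method", args=(...,))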

create_trace_span

create_trace_span(seq_group: SequenceGroup) -> None
Source code in vllm/engine/llm_engine.py
def create_trace_span(self, seq_group: SequenceGroup) -> None:
    if self.tracer is None or seq_group.sampling_params is None:
        return
    arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9)

    trace_context = extract_trace_context(seq_group.trace_headers)

    with self.tracer.start_as_current_span(
            "llm_request",
            kind=SpanKind.SERVER,
            context=trace_context,
            start_time=arrival_time_nano_seconds) as seq_span:
        metrics = seq_group.metrics

        # Handle potential None values for cancelled/aborted requests
        ttft = (metrics.first_token_time - metrics.arrival_time
                if metrics.first_token_time is not None else None)

        e2e_time = (metrics.finished_time - metrics.arrival_time
                    if metrics.finished_time is not None else None)

        seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL,
                               self.model_config.model)
        seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID,
                               seq_group.request_id)
        seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
                               seq_group.sampling_params.temperature)
        seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P,
                               seq_group.sampling_params.top_p)
        seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
                               seq_group.sampling_params.max_tokens)
        seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N,
                               seq_group.sampling_params.n)
        seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES,
                               seq_group.num_seqs())
        seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
                               len(seq_group.prompt_token_ids))
        seq_span.set_attribute(
            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
            sum([
                seq.get_output_len()
                for seq in seq_group.get_finished_seqs()
            ]))

        # Only set timing attributes if the values are available
        if metrics.time_in_queue is not None:
            seq_span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                metrics.time_in_queue)
        if ttft is not None:
            seq_span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft)
        if e2e_time is not None:
            seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E,
                                   e2e_time)
        if metrics.scheduler_time is not None:
            seq_span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER,
                metrics.scheduler_time)
        if metrics.model_forward_time is not None:
            seq_span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD,
                metrics.model_forward_time / 1000.0)
        if metrics.model_execute_time is not None:
            seq_span.set_attribute(
                SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE,
                metrics.model_execute_time)

do_log_stats

do_log_stats(
    scheduler_outputs: Optional[SchedulerOutputs] = None,
    model_output: Optional[List[SamplerOutput]] = None,
    finished_before: Optional[List[int]] = None,
    skip: Optional[List[int]] = None,
) -> None

Forced log when no requests active.

Source code in vllm/engine/llm_engine.py
def do_log_stats(self,
                 scheduler_outputs: Optional[SchedulerOutputs] = None,
                 model_output: Optional[List[SamplerOutput]] = None,
                 finished_before: Optional[List[int]] = None,
                 skip: Optional[List[int]] = None) -> None:
    """Forced log when no requests active."""
    if self.log_stats:
        stats = self._get_stats(scheduler_outputs, model_output,
                                finished_before, skip)
        for logger in self.stat_loggers.values():
            logger.log(stats)

do_tracing

do_tracing(
    scheduler_outputs: SchedulerOutputs,
    finished_before: Optional[List[int]] = None,
) -> None
Source code in vllm/engine/llm_engine.py
def do_tracing(self,
               scheduler_outputs: SchedulerOutputs,
               finished_before: Optional[List[int]] = None) -> None:
    if self.tracer is None:
        return

    for idx, scheduled_seq_group in enumerate(
            scheduler_outputs.scheduled_seq_groups):
        # Skip double tracing when using async output proc
        if finished_before and idx in finished_before:
            continue

        seq_group = scheduled_seq_group.seq_group
        if seq_group.is_finished():
            self.create_trace_span(seq_group)

enable_output_validation classmethod

enable_output_validation()
Source code in vllm/engine/llm_engine.py
@classmethod
@contextmanager
def enable_output_validation(cls):
    cls.DO_VALIDATE_OUTPUT = True

    yield

    cls.DO_VALIDATE_OUTPUT = False

from_engine_args classmethod

from_engine_args(
    engine_args: EngineArgs,
    usage_context: UsageContext = ENGINE_CONTEXT,
    stat_loggers: Optional[
        Dict[str, StatLoggerBase]
    ] = None,
) -> LLMEngine

Creates an LLM engine from the engine arguments.

Source code in vllm/engine/llm_engine.py
@classmethod
def from_engine_args(
    cls,
    engine_args: EngineArgs,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
) -> "LLMEngine":
    """Creates an LLM engine from the engine arguments."""
    # Create the engine configs.
    vllm_config = engine_args.create_engine_config(usage_context)

    engine_cls = cls
    if envs.VLLM_USE_V1:
        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
        engine_cls = V1LLMEngine

    return engine_cls.from_vllm_config(
        vllm_config=vllm_config,
        usage_context=usage_context,
        stat_loggers=stat_loggers,
        disable_log_stats=engine_args.disable_log_stats,
    )
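
Typical construction path, shown as a short sketch:

from vllm import EngineArgs, LLMEngine

engine_args = EngineArgs(model="facebook/opt-125m", dtype="float16")
engine = LLMEngine.from_engine_args(engine_args)
# With VLLM_USE_V1 set, this transparently returns the V1 LLMEngine;
# otherwise this class is built via from_vllm_config.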

from_vllm_config classmethod

from_vllm_config(
    vllm_config: VllmConfig,
    usage_context: UsageContext = ENGINE_CONTEXT,
    stat_loggers: Optional[
        Dict[str, StatLoggerBase]
    ] = None,
    disable_log_stats: bool = False,
) -> LLMEngine
Source code in vllm/engine/llm_engine.py
@classmethod
def from_vllm_config(
    cls,
    vllm_config: VllmConfig,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    disable_log_stats: bool = False,
) -> "LLMEngine":
    return cls(
        vllm_config=vllm_config,
        executor_class=cls._get_executor_cls(vllm_config),
        log_stats=(not disable_log_stats),
        usage_context=usage_context,
        stat_loggers=stat_loggers,
    )

get_decoding_config

get_decoding_config() -> DecodingConfig

Gets the decoding configuration.

Source code in vllm/engine/llm_engine.py
def get_decoding_config(self) -> DecodingConfig:
    """Gets the decoding configuration."""
    return self.decoding_config

get_lora_config

get_lora_config() -> LoRAConfig

Gets the LoRA configuration.

Source code in vllm/engine/llm_engine.py
def get_lora_config(self) -> LoRAConfig:
    """Gets the LoRA configuration."""
    return self.lora_config

get_model_config

get_model_config() -> ModelConfig

Gets the model configuration.

Source code in vllm/engine/llm_engine.py
def get_model_config(self) -> ModelConfig:
    """Gets the model configuration."""
    return self.model_config

get_num_unfinished_requests

get_num_unfinished_requests() -> int

Gets the number of unfinished requests.

Source code in vllm/engine/llm_engine.py
def get_num_unfinished_requests(self) -> int:
    """Gets the number of unfinished requests."""
    return sum(scheduler.get_num_unfinished_seq_groups()
               for scheduler in self.scheduler)

get_parallel_config

get_parallel_config() -> ParallelConfig

Gets the parallel configuration.

Source code in vllm/engine/llm_engine.py
def get_parallel_config(self) -> ParallelConfig:
    """Gets the parallel configuration."""
    return self.parallel_config

get_scheduler_config

get_scheduler_config() -> SchedulerConfig

Gets the scheduler configuration.

Source code in vllm/engine/llm_engine.py
def get_scheduler_config(self) -> SchedulerConfig:
    """Gets the scheduler configuration."""
    return self.scheduler_config

get_tokenizer

get_tokenizer(
    lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer
Source code in vllm/engine/llm_engine.py
def get_tokenizer(
    self,
    lora_request: Optional[LoRARequest] = None,
) -> AnyTokenizer:
    return self.get_tokenizer_group().get_lora_tokenizer(lora_request)

get_tokenizer_group

get_tokenizer_group() -> TokenizerGroup
Source code in vllm/engine/llm_engine.py
def get_tokenizer_group(self) -> TokenizerGroup:
    if self.tokenizer is None:
        raise ValueError("Unable to get tokenizer because "
                         "skip_tokenizer_init is True")

    return self.tokenizer

get_vllm_config

get_vllm_config() -> VllmConfig

Gets the vllm configuration.

Source code in vllm/engine/llm_engine.py
def get_vllm_config(self) -> VllmConfig:
    """Gets the vllm configuration."""
    return self.vllm_config

has_unfinished_requests

has_unfinished_requests() -> bool

Returns True if there are unfinished requests.

Source code in vllm/engine/llm_engine.py
def has_unfinished_requests(self) -> bool:
    """Returns True if there are unfinished requests."""
    return any(scheduler.has_unfinished_seqs()
               for scheduler in self.scheduler)

has_unfinished_requests_for_virtual_engine

has_unfinished_requests_for_virtual_engine(
    virtual_engine: int,
) -> bool

Returns True if there are unfinished requests for the virtual engine.

Source code in vllm/engine/llm_engine.py
def has_unfinished_requests_for_virtual_engine(
        self, virtual_engine: int) -> bool:
    """
    Returns True if there are unfinished requests for the virtual engine.
    """
    return self.scheduler[virtual_engine].has_unfinished_seqs()

is_sleeping

is_sleeping() -> bool
Source code in vllm/engine/llm_engine.py
def is_sleeping(self) -> bool:
    return self.model_executor.is_sleeping

is_tracing_enabled

is_tracing_enabled() -> bool
Source code in vllm/engine/llm_engine.py
def is_tracing_enabled(self) -> bool:
    return self.tracer is not None

list_loras

list_loras() -> Set[int]
Source code in vllm/engine/llm_engine.py
def list_loras(self) -> Set[int]:
    return self.model_executor.list_loras()

pin_lora

pin_lora(lora_id: int) -> bool
Source code in vllm/engine/llm_engine.py
def pin_lora(self, lora_id: int) -> bool:
    return self.model_executor.pin_lora(lora_id)

remove_logger

remove_logger(logger_name: str) -> None
Source code in vllm/engine/llm_engine.py
def remove_logger(self, logger_name: str) -> None:
    if not self.log_stats:
        raise RuntimeError(
            "Stat logging is disabled. Set `disable_log_stats=False` "
            "argument to enable.")
    if logger_name not in self.stat_loggers:
        raise KeyError(f"Logger with name {logger_name} does not exist.")
    del self.stat_loggers[logger_name]

remove_lora

remove_lora(lora_id: int) -> bool
Source code in vllm/engine/llm_engine.py
def remove_lora(self, lora_id: int) -> bool:
    return self.model_executor.remove_lora(lora_id)

reset_mm_cache

reset_mm_cache() -> bool

Reset the multi-modal cache.

Source code in vllm/engine/llm_engine.py
def reset_mm_cache(self) -> bool:
    """Reset the multi-modal cache."""
    return self.input_preprocessor.mm_registry.reset_processor_cache(
        self.model_config)

reset_prefix_cache

reset_prefix_cache(device: Optional[Device] = None) -> bool

Reset prefix cache for all devices.

Source code in vllm/engine/llm_engine.py
def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
    """Reset prefix cache for all devices."""

    success = True
    for scheduler in self.scheduler:
        success = success and scheduler.reset_prefix_cache(device)
    return success

sleep

sleep(level: int = 1) -> None
Source code in vllm/engine/llm_engine.py
def sleep(self, level: int = 1) -> None:
    assert self.vllm_config.model_config.enable_sleep_mode, (
        "Sleep mode is not enabled in the model config")
    self.model_executor.sleep(level=level)

start_profile

start_profile() -> None
Source code in vllm/engine/llm_engine.py
def start_profile(self) -> None:
    self.model_executor.start_profile()

step

step() -> List[Union[RequestOutput, PoolingRequestOutput]]

Performs one decoding iteration and returns newly generated results.

Figure: Overview of the step function

Details
  • Step 1: Schedules the sequences to be executed in the next iteration and the token blocks to be swapped in/out/copied.
    • Depending on the scheduling policy, sequences may be `preempted/reordered`.
    • A Sequence Group (SG) refers to a group of sequences that are generated from the same prompt.
  • Step 2: Calls the distributed executor to execute the model.
  • Step 3: Processes the model output. This mainly includes:
    • Decodes the relevant outputs.
    • Updates the scheduled sequence groups with model outputs based on their sampling parameters (use_beam_search or not).
    • Frees the finished sequence groups.
  • Finally, it creates and returns the newly generated results.

Example:

# Please see the example/ folder for more detailed examples.

# initialize engine and request arguments
engine = LLMEngine.from_engine_args(engine_args)
example_inputs = [(0, "What is LLM?",
                   SamplingParams(temperature=0.0))]

# Start the engine with an event loop
while True:
    if example_inputs:
        req_id, prompt, sampling_params = example_inputs.pop(0)
        engine.add_request(str(req_id), prompt, sampling_params)

    # continue the request processing
    request_outputs = engine.step()
    for request_output in request_outputs:
        if request_output.finished:
            # return or show the request output
            print(request_output)

    if not (engine.has_unfinished_requests() or example_inputs):
        break

Source code in vllm/engine/llm_engine.py
def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
    """Performs one decoding iteration and returns newly generated results.

    <figure markdown="span">
    ![Overview of the step function](https://i.imgur.com/sv2HssD.png)
    <figcaption>Overview of the step function</figcaption>
    </figure>

    Details:
    - Step 1: Schedules the sequences to be executed in the next
        iteration and the token blocks to be swapped in/out/copy.

        - Depending on the scheduling policy,
            sequences may be `preempted/reordered`.
        - A Sequence Group (SG) refer to a group of sequences
            that are generated from the same prompt.

    - Step 2: Calls the distributed executor to execute the model.
    - Step 3: Processes the model output. This mainly includes:

        - Decodes the relevant outputs.
        - Updates the scheduled sequence groups with model outputs
            based on its `sampling parameters` (`use_beam_search` or not).
        - Frees the finished sequence groups.

    - Finally, it creates and returns the newly generated results.

    Example:
    ```
    # Please see the example/ folder for more detailed examples.

    # initialize engine and request arguments
    engine = LLMEngine.from_engine_args(engine_args)
    example_inputs = [(0, "What is LLM?",
    SamplingParams(temperature=0.0))]

    # Start the engine with an event loop
    while True:
        if example_inputs:
            req_id, prompt, sampling_params = example_inputs.pop(0)
            engine.add_request(str(req_id),prompt,sampling_params)

        # continue the request processing
        request_outputs = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                # return or show the request output

        if not (engine.has_unfinished_requests() or example_inputs):
            break
    ```
    """
    if self.parallel_config.pipeline_parallel_size > 1:
        raise NotImplementedError(
            "Pipeline parallelism is only supported through AsyncLLMEngine "
            "as performance will be severely degraded otherwise.")

    # For llm_engine, there is no pipeline parallel support, so the engine
    # used is always 0.
    virtual_engine = 0

    # These are cached outputs from previous iterations. None if on first
    # iteration
    cached_outputs = self.cached_scheduler_outputs[virtual_engine]
    seq_group_metadata_list = cached_outputs.seq_group_metadata_list
    scheduler_outputs = cached_outputs.scheduler_outputs
    allow_async_output_proc = cached_outputs.allow_async_output_proc

    ctx = self.scheduler_contexts[virtual_engine]

    # Clear outputs for each new scheduler iteration
    ctx.request_outputs.clear()

    # Skip the scheduler if there are any remaining steps in the seq groups.
    # This ensures that the scheduler is only called again when the current
    # batch has completed.
    # The scheduler is also skipped if a single request caused the last
    # engine step to fail, and the previous schedule needs to be rerun.
    if not self._has_remaining_steps(
            seq_group_metadata_list
    ) and not self._skip_scheduling_next_step:
        # Schedule iteration
        (seq_group_metadata_list, scheduler_outputs,
         allow_async_output_proc
         ) = self.scheduler[virtual_engine].schedule()

        ctx.seq_group_metadata_list = seq_group_metadata_list
        ctx.scheduler_outputs = scheduler_outputs

        finished_requests_ids = self.scheduler[
            virtual_engine].get_and_reset_finished_requests_ids()
        # When n>1, elements in self.seq_id_to_seq_group should be deleted
        # here, otherwise memory leaks.
        for finished_request_id in finished_requests_ids:
            if finished_request_id in self.seq_id_to_seq_group:
                del self.seq_id_to_seq_group[finished_request_id]

        # Maybe switch from async mode to sync mode
        if not allow_async_output_proc and len(ctx.output_queue) > 0:
            self._process_model_outputs(ctx=ctx)

    else:
        finished_requests_ids = list()

    assert seq_group_metadata_list is not None
    assert scheduler_outputs is not None

    if not scheduler_outputs.is_empty():

        # Check if we have a cached last_output from the previous iteration.
        # For supporting PP this is probably the best way to pass the
        # sampled_token_ids, as a separate broadcast over all the PP stages
        # will cause one virtual engine's microbatch to block the pipeline.
        last_sampled_token_ids = \
            self._get_last_sampled_token_ids(virtual_engine)

        execute_model_req = ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
            blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
            blocks_to_copy=scheduler_outputs.blocks_to_copy,
            num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
            running_queue_size=scheduler_outputs.running_queue_size,
            finished_requests_ids=finished_requests_ids,
            # We use ExecuteModelRequest to pass the last sampled_token_ids
            # to each of the non-last PP stages for in-place prepare_input.
            last_sampled_token_ids=last_sampled_token_ids)

        if allow_async_output_proc:
            execute_model_req.async_callback = self.async_callbacks[
                virtual_engine]

        try:
            outputs = self.model_executor.execute_model(
                execute_model_req=execute_model_req)
            self._skip_scheduling_next_step = False
        except InputProcessingError as e:
            # The input for this request cannot be processed, so we must
            # abort it. If there are remaining requests in the batch that
            # have been scheduled, they will be retried on the next step.
            invalid_request_id = e.request_id
            self._abort_and_cache_schedule(
                request_id=invalid_request_id,
                virtual_engine=virtual_engine,
                seq_group_metadata_list=seq_group_metadata_list,
                scheduler_outputs=scheduler_outputs,
                allow_async_output_proc=allow_async_output_proc)
            # Raise so the caller is notified that this request failed
            raise

    else:
        # Nothing scheduled => If there is pending async postprocessor,
        # then finish it here.
        if len(ctx.output_queue) > 0:
            self._process_model_outputs(ctx=ctx)
        # No outputs in this case
        outputs = []

    if not self._has_remaining_steps(seq_group_metadata_list):
        # is_first_step_output is True only when the num_steps of all
        # the sequences are 1.
        is_first_step_output: bool = False if not seq_group_metadata_list \
            else seq_group_metadata_list[0].state.num_steps == 1

        # Add results to the output_queue
        ctx.append_output(outputs=outputs,
                          seq_group_metadata_list=seq_group_metadata_list,
                          scheduler_outputs=scheduler_outputs,
                          is_async=allow_async_output_proc,
                          is_last_step=True,
                          is_first_step_output=is_first_step_output)

        if outputs and allow_async_output_proc:
            assert len(outputs) == 1, (
                "Async postprocessor expects only a single output set")

            self._advance_to_next_step(
                outputs[0], seq_group_metadata_list,
                scheduler_outputs.scheduled_seq_groups)

        # Check if need to run the usual non-async path
        if not allow_async_output_proc:
            self._process_model_outputs(ctx=ctx)

            # Log stats.
            self.do_log_stats(scheduler_outputs, outputs)

            # Tracing
            self.do_tracing(scheduler_outputs)
    else:
        # Multi-step case
        return ctx.request_outputs

    if not self.has_unfinished_requests():
        # Drain async postprocessor (if exists)
        if len(ctx.output_queue) > 0:
            self._process_model_outputs(ctx=ctx)
        assert len(ctx.output_queue) == 0

        # Stop the execute model loop in parallel workers until there are
        # more requests to process. This avoids waiting indefinitely in
        # torch.distributed ops which may otherwise timeout, and unblocks
        # the RPC thread in the workers so that they can process any other
        # queued control plane messages, such as add/remove lora adapters.
        logger.debug("Stopping remote worker execution loop.")
        self.model_executor.stop_remote_worker_execution_loop()

    return ctx.request_outputs

stop_profile

stop_profile() -> None
Source code in vllm/engine/llm_engine.py
def stop_profile(self) -> None:
    self.model_executor.stop_profile()
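
A hedged sketch of bracketing a few decode iterations with start_profile/stop_profile. Whether a trace is actually captured depends on how profiling was configured for the workers (for example via the VLLM_TORCH_PROFILER_DIR environment variable in some setups, which is an assumption here); otherwise the calls may fail or be no-ops.

# Profile a handful of engine steps (assumes worker-side profiling is enabled).
engine.start_profile()
for _ in range(5):
    if not engine.has_unfinished_requests():
        break
    engine.step()
engine.stop_profile()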

stop_remote_worker_execution_loop

stop_remote_worker_execution_loop() -> None
Source code in vllm/engine/llm_engine.py
def stop_remote_worker_execution_loop(self) -> None:
    self.model_executor.stop_remote_worker_execution_loop()

validate_output classmethod

validate_output(
    output: object, output_type: Type[_O]
) -> _O
Source code in vllm/engine/llm_engine.py
@classmethod
def validate_output(
    cls,
    output: object,
    output_type: Type[_O],
) -> _O:
    do_validate = cls.DO_VALIDATE_OUTPUT

    if ((TYPE_CHECKING or do_validate)
            and not isinstance(output, output_type)):
        raise TypeError(f"Expected output of type {output_type}, "
                        f"but found type {type(output)}")

    return cast(_O, output)

validate_outputs classmethod

validate_outputs(
    outputs: Sequence[object], output_type: Type[_O]
) -> List[_O]
Source code in vllm/engine/llm_engine.py
@classmethod
def validate_outputs(
    cls,
    outputs: GenericSequence[object],
    output_type: Type[_O],
) -> List[_O]:
    do_validate = cls.DO_VALIDATE_OUTPUT

    outputs_: List[_O]
    if TYPE_CHECKING or do_validate:
        outputs_ = []
        for output in outputs:
            if not isinstance(output, output_type):
                raise TypeError(f"Expected output of type {output_type}, "
                                f"but found type {type(output)}")

            outputs_.append(output)
    else:
        outputs_ = outputs

    return outputs_
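
A small sketch of narrowing the element type of engine outputs with these classmethods; the isinstance check is only enforced when DO_VALIDATE_OUTPUT (or static type checking) is active, otherwise the inputs pass through unchanged.

from vllm import LLMEngine, RequestOutput

outputs = engine.step()

# Raises TypeError on a mismatch when validation is enabled.
request_outputs = LLMEngine.validate_outputs(outputs, RequestOutput)

if request_outputs:
    first = LLMEngine.validate_output(request_outputs[0], RequestOutput)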

wake_up

wake_up(tags: Optional[list[str]] = None) -> None
Source code in vllm/engine/llm_engine.py
def wake_up(self, tags: Optional[list[str]] = None) -> None:
    assert self.vllm_config.model_config.enable_sleep_mode, (
        "Sleep mode is not enabled in the model config")
    self.model_executor.wake_up(tags)
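
A hedged sketch of the sleep/wake-up cycle for releasing accelerator memory between bursts of traffic. It assumes the engine was created with sleep mode enabled in the model config (e.g. via an `enable_sleep_mode=True` engine argument, which is an assumption about the flag name); both methods assert that sleep mode is enabled.

# Release resources while the engine is idle (level 1 is the lighter level).
engine.sleep(level=1)

# ... later, before accepting new requests ...
engine.wake_up()
# engine.wake_up(tags=["weights"])  # selective wake-up, if supported by the setup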

PoolingOutput dataclass

The output data of one pooling output of a request.

Parameters:

Name Type Description Default
data Tensor

The extracted hidden states.

required
Source code in vllm/outputs.py
@dataclass
class PoolingOutput:
    """The output data of one pooling output of a request.

    Args:
        data: The extracted hidden states.
    """
    data: torch.Tensor

    def __repr__(self) -> str:
        return (f"PoolingOutput(data={self.data})")

    def __eq__(self, other: object) -> bool:
        return (isinstance(other, self.__class__) and bool(
            (self.data == other.data).all()))

data instance-attribute

data: Tensor

__eq__

__eq__(other: object) -> bool
Source code in vllm/outputs.py
def __eq__(self, other: object) -> bool:
    return (isinstance(other, self.__class__) and bool(
        (self.data == other.data).all()))

__init__

__init__(data: Tensor) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (f"PoolingOutput(data={self.data})")

PoolingParams

Bases: Struct

API parameters for pooling models.

Attributes:

Name Type Description
normalize Optional[bool]

Whether to normalize the embedding outputs.

dimensions Optional[int]

Reduce the dimensionality of the embeddings if the model supports matryoshka representation.

activation Optional[bool]

Whether to apply an activation function to the classification outputs.

softmax Optional[bool]

Whether to apply softmax to the reward outputs.

Source code in vllm/pooling_params.py
class PoolingParams(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        array_like=True):  # type: ignore[call-arg]
    """API parameters for pooling models.

    Attributes:
        normalize: Whether to normalize the embeddings outputs.
        dimensions: Reduce the dimensions of embeddings
                    if model support matryoshka representation.
        activation: Whether to apply activation function to
                    the classification outputs.
        softmax: Whether to apply softmax to the reward outputs.
    """

    ## for embeddings models
    dimensions: Optional[int] = None
    normalize: Optional[bool] = None

    ## for classification models
    activation: Optional[bool] = None

    ## for reward models
    softmax: Optional[bool] = None
    step_tag_id: Optional[int] = None
    returned_token_ids: Optional[list[int]] = None

    task: Optional[PoolingTask] = None
    """Internal use only."""

    requires_token_ids: bool = False
    """Internal use only."""

    extra_kwargs: Optional[dict[str, Any]] = None
    """Internal use only."""

    output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY

    @property
    def all_parameters(self) -> list[str]:
        return [
            "dimensions", "normalize", "activation", "softmax", "step_tag_id",
            "returned_token_ids"
        ]

    @property
    def valid_parameters(self):
        return {
            "embed": ["dimensions", "normalize"],
            "classify": ["activation"],
            "score": ["activation"],
            "encode": ["softmax", "step_tag_id", "returned_token_ids"],
        }

    def clone(self) -> "PoolingParams":
        """Returns a deep copy of the PoolingParams instance."""
        return deepcopy(self)

    def verify(self,
               task: PoolingTask,
               model_config: Optional["ModelConfig"] = None) -> None:

        if self.task is None:
            self.task = task
        elif self.task != task:
            msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
            raise ValueError(msg)

        # NOTE: Task validation needs to done against the model instance,
        # which is not available in model config. So, it's not included
        # in this method

        self._merge_default_parameters(model_config)
        self._set_default_parameters(model_config)
        self._verify_valid_parameters()

    def _merge_default_parameters(self,
                                  model_config: Optional["ModelConfig"] = None
                                  ) -> None:

        if model_config is None:
            return

        pooler_config = model_config.pooler_config
        if pooler_config is None:
            return

        assert self.task is not None, "task must be set"
        valid_parameters = self.valid_parameters[self.task]

        for k in valid_parameters:
            if getattr(pooler_config, k, None) is None:
                continue

            if getattr(self, k, None) is None:
                setattr(self, k, getattr(pooler_config, k))

    def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
        if self.task == "embed":
            if self.normalize is None:
                self.normalize = True

            if self.dimensions is not None and model_config is not None:
                if not model_config.is_matryoshka:
                    raise ValueError(
                        f'Model "{model_config.served_model_name}" does not '
                        f'support matryoshka representation, '
                        f'changing output dimensions will lead to poor results.'
                    )

                mds = model_config.matryoshka_dimensions
                if mds is not None:
                    if self.dimensions not in mds:
                        raise ValueError(
                            f'Model "{model_config.served_model_name}" '
                            f'only supports {str(mds)} matryoshka dimensions, '
                            f'use other output dimensions will '
                            f'lead to poor results.')
                elif self.dimensions < 1:
                    raise ValueError("Dimensions must be greater than 0")

        elif self.task in ["classify", "score"]:
            if self.activation is None:
                self.activation = True

        elif self.task == "encode":
            if self.softmax is None:
                self.softmax = True
        else:
            raise ValueError(f"Unknown pooling task: {self.task}")

    def _verify_valid_parameters(self):
        assert self.task is not None, "task must be set"
        valid_parameters = self.valid_parameters[self.task]
        invalid_parameters = []
        for k in self.all_parameters:
            if k in valid_parameters:
                continue

            if getattr(self, k, None) is not None:
                invalid_parameters.append(k)

        if invalid_parameters:
            raise ValueError(
                f"Task {self.task} only supports {valid_parameters} "
                f"parameters, does not support "
                f"{invalid_parameters} parameters")

    def __repr__(self) -> str:
        return (f"PoolingParams("
                f"task={self.task}, "
                f"normalize={self.normalize}, "
                f"dimensions={self.dimensions}, "
                f"activation={self.activation}, "
                f"softmax={self.softmax}, "
                f"step_tag_id={self.step_tag_id}, "
                f"returned_token_ids={self.returned_token_ids}, "
                f"requires_token_ids={self.requires_token_ids}, "
                f"extra_kwargs={self.extra_kwargs})")

    def __post_init__(self) -> None:
        assert self.output_kind == RequestOutputKind.FINAL_ONLY,\
            "For pooling output_kind has to be FINAL_ONLY"

activation class-attribute instance-attribute

activation: Optional[bool] = None

all_parameters property

all_parameters: list[str]

dimensions class-attribute instance-attribute

dimensions: Optional[int] = None

extra_kwargs class-attribute instance-attribute

extra_kwargs: Optional[dict[str, Any]] = None

Internal use only.

normalize class-attribute instance-attribute

normalize: Optional[bool] = None

output_kind class-attribute instance-attribute

output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY

requires_token_ids class-attribute instance-attribute

requires_token_ids: bool = False

Internal use only.

returned_token_ids class-attribute instance-attribute

returned_token_ids: Optional[list[int]] = None

softmax class-attribute instance-attribute

softmax: Optional[bool] = None

step_tag_id class-attribute instance-attribute

step_tag_id: Optional[int] = None

task class-attribute instance-attribute

task: Optional[PoolingTask] = None

Internal use only.

valid_parameters property

valid_parameters

__post_init__

__post_init__() -> None
Source code in vllm/pooling_params.py
def __post_init__(self) -> None:
    assert self.output_kind == RequestOutputKind.FINAL_ONLY,\
        "For pooling output_kind has to be FINAL_ONLY"

__repr__

__repr__() -> str
Source code in vllm/pooling_params.py
def __repr__(self) -> str:
    return (f"PoolingParams("
            f"task={self.task}, "
            f"normalize={self.normalize}, "
            f"dimensions={self.dimensions}, "
            f"activation={self.activation}, "
            f"softmax={self.softmax}, "
            f"step_tag_id={self.step_tag_id}, "
            f"returned_token_ids={self.returned_token_ids}, "
            f"requires_token_ids={self.requires_token_ids}, "
            f"extra_kwargs={self.extra_kwargs})")

_merge_default_parameters

_merge_default_parameters(
    model_config: Optional[ModelConfig] = None,
) -> None
Source code in vllm/pooling_params.py
def _merge_default_parameters(self,
                              model_config: Optional["ModelConfig"] = None
                              ) -> None:

    if model_config is None:
        return

    pooler_config = model_config.pooler_config
    if pooler_config is None:
        return

    assert self.task is not None, "task must be set"
    valid_parameters = self.valid_parameters[self.task]

    for k in valid_parameters:
        if getattr(pooler_config, k, None) is None:
            continue

        if getattr(self, k, None) is None:
            setattr(self, k, getattr(pooler_config, k))

_set_default_parameters

_set_default_parameters(
    model_config: Optional[ModelConfig],
)
Source code in vllm/pooling_params.py
def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
    if self.task == "embed":
        if self.normalize is None:
            self.normalize = True

        if self.dimensions is not None and model_config is not None:
            if not model_config.is_matryoshka:
                raise ValueError(
                    f'Model "{model_config.served_model_name}" does not '
                    f'support matryoshka representation, '
                    f'changing output dimensions will lead to poor results.'
                )

            mds = model_config.matryoshka_dimensions
            if mds is not None:
                if self.dimensions not in mds:
                    raise ValueError(
                        f'Model "{model_config.served_model_name}" '
                        f'only supports {str(mds)} matryoshka dimensions, '
                        f'use other output dimensions will '
                        f'lead to poor results.')
            elif self.dimensions < 1:
                raise ValueError("Dimensions must be greater than 0")

    elif self.task in ["classify", "score"]:
        if self.activation is None:
            self.activation = True

    elif self.task == "encode":
        if self.softmax is None:
            self.softmax = True
    else:
        raise ValueError(f"Unknown pooling task: {self.task}")

_verify_valid_parameters

_verify_valid_parameters()
Source code in vllm/pooling_params.py
def _verify_valid_parameters(self):
    assert self.task is not None, "task must be set"
    valid_parameters = self.valid_parameters[self.task]
    invalid_parameters = []
    for k in self.all_parameters:
        if k in valid_parameters:
            continue

        if getattr(self, k, None) is not None:
            invalid_parameters.append(k)

    if invalid_parameters:
        raise ValueError(
            f"Task {self.task} only supports {valid_parameters} "
            f"parameters, does not support "
            f"{invalid_parameters} parameters")

clone

clone() -> PoolingParams

Returns a deep copy of the PoolingParams instance.

Source code in vllm/pooling_params.py
def clone(self) -> "PoolingParams":
    """Returns a deep copy of the PoolingParams instance."""
    return deepcopy(self)

verify

verify(
    task: PoolingTask,
    model_config: Optional[ModelConfig] = None,
) -> None
Source code in vllm/pooling_params.py
def verify(self,
           task: PoolingTask,
           model_config: Optional["ModelConfig"] = None) -> None:

    if self.task is None:
        self.task = task
    elif self.task != task:
        msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
        raise ValueError(msg)

    # NOTE: Task validation needs to done against the model instance,
    # which is not available in model config. So, it's not included
    # in this method

    self._merge_default_parameters(model_config)
    self._set_default_parameters(model_config)
    self._verify_valid_parameters()
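
A brief sketch of how verify binds a task and fills defaults, using only the parameters defined above; without a model_config, the matryoshka-dimension checks are skipped, and parameters that do not belong to the task raise ValueError.

from vllm import PoolingParams

params = PoolingParams(dimensions=256)  # embedding-style request
params.verify(task="embed")             # binds the task and sets normalize=True

print(params.task, params.normalize)    # embed True

# A reward-only parameter is rejected for the embedding task.
bad = PoolingParams(softmax=True)
try:
    bad.verify(task="embed")
except ValueError as err:
    print(err)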

PoolingRequestOutput

Bases: Generic[_O]

The output data of a pooling request to the LLM.

Parameters:

Name Type Description Default
request_id str

A unique identifier for the pooling request.

required
outputs PoolingOutput

The pooling results for the given input.

required
prompt_token_ids list[int]

A list of token IDs used in the prompt.

required
finished bool

A flag indicating whether the pooling is completed.

required
Source code in vllm/outputs.py
class PoolingRequestOutput(Generic[_O]):
    """
    The output data of a pooling request to the LLM.

    Args:
        request_id (str): A unique identifier for the pooling request.
        outputs (PoolingOutput): The pooling results for the given input.
        prompt_token_ids (list[int]): A list of token IDs used in the prompt.
        finished (bool): A flag indicating whether the pooling is completed.
    """

    def __init__(self, request_id: str, outputs: _O,
                 prompt_token_ids: list[int], finished: bool):
        self.request_id = request_id
        self.prompt_token_ids = prompt_token_ids
        self.finished = finished
        self.outputs = outputs

    @staticmethod
    def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput":
        pooled_data = seq_group.pooled_data
        assert pooled_data is not None

        data = pooled_data.to(dtype=torch.float32, device="cpu")
        output = PoolingOutput(data)
        prompt_token_ids = seq_group.prompt_token_ids
        finished = seq_group.is_finished()

        return PoolingRequestOutput(seq_group.request_id, output,
                                    prompt_token_ids, finished)

    def __repr__(self):
        return (f"{type(self).__name__}(request_id={self.request_id!r}, "
                f"outputs={self.outputs!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"finished={self.finished})")

finished instance-attribute

finished = finished

outputs instance-attribute

outputs = outputs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    outputs: _O,
    prompt_token_ids: list[int],
    finished: bool,
)
Source code in vllm/outputs.py
def __init__(self, request_id: str, outputs: _O,
             prompt_token_ids: list[int], finished: bool):
    self.request_id = request_id
    self.prompt_token_ids = prompt_token_ids
    self.finished = finished
    self.outputs = outputs

__repr__

__repr__()
Source code in vllm/outputs.py
def __repr__(self):
    return (f"{type(self).__name__}(request_id={self.request_id!r}, "
            f"outputs={self.outputs!r}, "
            f"prompt_token_ids={self.prompt_token_ids}, "
            f"finished={self.finished})")

from_seq_group staticmethod

from_seq_group(
    seq_group: SequenceGroup,
) -> PoolingRequestOutput
Source code in vllm/outputs.py
@staticmethod
def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput":
    pooled_data = seq_group.pooled_data
    assert pooled_data is not None

    data = pooled_data.to(dtype=torch.float32, device="cpu")
    output = PoolingOutput(data)
    prompt_token_ids = seq_group.prompt_token_ids
    finished = seq_group.is_finished()

    return PoolingRequestOutput(seq_group.request_id, output,
                                prompt_token_ids, finished)
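
A minimal construction sketch: outside the engine, a PoolingRequestOutput can be assembled directly from a PoolingOutput, which is mostly useful in tests or downstream plumbing.

import torch

from vllm import PoolingOutput, PoolingRequestOutput

result = PoolingRequestOutput(
    request_id="req-0",
    outputs=PoolingOutput(data=torch.zeros(8)),
    prompt_token_ids=[1, 2, 3],
    finished=True,
)
print(result)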

RequestOutput

The output data of a completion request to the LLM.

Parameters:

Name Type Description Default
request_id str

The unique ID of the request.

required
prompt Optional[str]

The prompt string of the request. For encoder/decoder models, this is the decoder input prompt.

required
prompt_token_ids Optional[list[int]]

The token IDs of the prompt. For encoder/decoder models, this is the decoder input prompt token ids.

required
prompt_logprobs Optional[PromptLogprobs]

The log probabilities to return per prompt token.

required
outputs list[CompletionOutput]

The output sequences of the request.

required
finished bool

Whether the whole request is finished.

required
metrics Optional[RequestMetrics]

Metrics associated with the request.

None
lora_request Optional[LoRARequest]

The LoRA request that was used to generate the output.

None
encoder_prompt Optional[str]

The encoder prompt string of the request. None if decoder-only.

None
encoder_prompt_token_ids Optional[list[int]]

The token IDs of the encoder prompt. None if decoder-only.

None
num_cached_tokens Optional[int]

The number of tokens with prefix cache hit.

None
kv_transfer_params Optional[dict[str, Any]]

The params for remote K/V transfer.

None
Source code in vllm/outputs.py
class RequestOutput:
    """The output data of a completion request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
                For encoder/decoder models, this is the
                decoder input prompt.
        prompt_token_ids: The token IDs of the prompt.
                          For encoder/decoder models, this is the
                          decoder input prompt token ids.
        prompt_logprobs: The log probabilities to return per prompt token.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
        metrics: Metrics associated with the request.
        lora_request: The LoRA request that was used to generate the output.
        encoder_prompt: The encoder prompt string of the request.
                        None if decoder-only.
        encoder_prompt_token_ids: The token IDs of the encoder prompt.
                                  None if decoder-only.
        num_cached_tokens: The number of tokens with prefix cache hit.
        kv_transfer_params: The params for remote K/V transfer.
    """

    def __init__(
        self,
        request_id: str,
        prompt: Optional[str],
        prompt_token_ids: Optional[list[int]],
        prompt_logprobs: Optional[PromptLogprobs],
        outputs: list[CompletionOutput],
        finished: bool,
        metrics: Optional[RequestMetrics] = None,
        lora_request: Optional[LoRARequest] = None,
        encoder_prompt: Optional[str] = None,
        encoder_prompt_token_ids: Optional[list[int]] = None,
        num_cached_tokens: Optional[int] = None,
        *,
        multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
        kv_transfer_params: Optional[dict[str, Any]] = None,
        # Forward compatibility, code that uses args added in new release can
        # still run with older versions of vLLM without breaking.
        **kwargs: Any,
    ) -> None:
        if kwargs:
            logger.warning_once("RequestOutput: Ignoring extra arguments: %s",
                                str(kwargs))
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.multi_modal_placeholders = multi_modal_placeholders or {}
        self.prompt_logprobs = prompt_logprobs
        self.outputs = outputs
        self.finished = finished
        self.metrics = metrics
        self.lora_request = lora_request
        self.encoder_prompt = encoder_prompt
        self.encoder_prompt_token_ids = encoder_prompt_token_ids
        self.num_cached_tokens = num_cached_tokens
        self.kv_transfer_params = kv_transfer_params

    def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
        """Merge subsequent RequestOutput into this one"""

        self.finished |= next_output.finished
        self.kv_transfer_params = next_output.kv_transfer_params

        for next_completion in next_output.outputs:
            for i, completion in enumerate(self.outputs):
                if completion.index == next_completion.index:
                    if aggregate:
                        # Merge outputs with same index
                        completion.text += next_completion.text
                        if not isinstance(completion.token_ids,
                                          MutableSequence):
                            completion.token_ids = list(completion.token_ids)
                        completion.token_ids.extend(next_completion.token_ids)
                        if next_completion.logprobs:
                            assert completion.logprobs is not None
                            completion.logprobs.extend(
                                next_completion.logprobs)
                        completion.cumulative_logprob = (
                            next_completion.cumulative_logprob)
                        completion.finish_reason = next_completion.finish_reason
                        completion.stop_reason = next_completion.stop_reason
                    else:
                        # Replace the output with the new one
                        self.outputs[i] = next_completion
                    break
            else:
                self.outputs.append(next_completion)

    @classmethod
    def from_seq_group(
        cls, seq_group: SequenceGroup, use_cache: bool,
        seq_id_to_seq_group: dict[str, SequenceGroupBase]
    ) -> Optional["RequestOutput"]:
        finished = seq_group.is_finished()

        if seq_group.request_id in seq_id_to_seq_group:
            group: SequenceGroupBase = seq_id_to_seq_group[
                seq_group.request_id]
            assembled_seq_group = group.maybe_assemble_group(seq_group)
            if finished:
                group.finish_seq(seq_group)
            if assembled_seq_group is None:
                return None

            # clear finished seq in seq_id_to_seq_group
            if len(group.to_be_finished) == 0:
                for sub_request_id in list(group.seq_id_to_index.keys()):
                    if sub_request_id in seq_id_to_seq_group:
                        del seq_id_to_seq_group[sub_request_id]

            return cls.from_seq_group(assembled_seq_group, use_cache,
                                      seq_id_to_seq_group)

        sampling_params = seq_group.sampling_params
        if sampling_params is None:
            raise ValueError(
                "Sampling parameters are missing for a CompletionRequest.")

        if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and (
                not finished):
            return None

        # Init cache (if needed)
        if use_cache and seq_group.cached_request_output is None:
            seq_group.cached_request_output = RequestOutput(  # type: ignore
                request_id="",
                prompt=None,
                prompt_token_ids=[],
                prompt_logprobs=None,
                outputs=[],
                finished=False)

        top_n_seqs = seq_group.get_seqs()

        # Create the outputs.
        # NOTE: We need omit logprobs here explicitly because the sequence
        # always has the logprobs of the sampled tokens even if the
        # logprobs are not requested.
        include_logprobs = sampling_params.logprobs is not None
        text_buffer_length = sampling_params.output_text_buffer_length
        delta = sampling_params.output_kind == RequestOutputKind.DELTA

        outputs = []
        include_prompt = True
        # num_cached_tokens should be the same for all the sequences
        num_cached_tokens = None
        for i, seq in enumerate(top_n_seqs):
            output_text = seq.get_output_text_to_return(
                text_buffer_length, delta)

            output_token_ids = seq.get_output_token_ids_to_return(delta)
            num_output_tokens = 1 if isinstance(output_token_ids,
                                                int) else len(output_token_ids)
            num_cached_tokens = seq.data.get_num_cached_tokens()

            output_logprobs = seq.output_logprobs if include_logprobs else None

            if delta:
                # Slice logprobs delta if applicable
                if output_logprobs:
                    # num_output_tokens can be 0 when n > 1 and request finishes
                    # before the others
                    if num_output_tokens > 0:
                        output_logprobs = output_logprobs[-num_output_tokens:]
                    else:
                        output_logprobs = None
                # Don't include prompt if this is after the first output
                # containing decode token ids
                if include_prompt and seq.get_output_len() > num_output_tokens:
                    include_prompt = False

            if use_cache:
                # Get cached output object
                cached_outputs = seq_group.cached_request_output.outputs  # type: ignore
                if i >= len(cached_outputs):
                    cached_outputs.append(
                        CompletionOutput(index=i,
                                         text="",
                                         token_ids=[],
                                         cumulative_logprob=None,
                                         logprobs=None,
                                         finish_reason=None,
                                         stop_reason=None))
                output = cached_outputs[i]

                # Init cached output object
                assert output.index == i
                output.text = output_text

                if isinstance(output_token_ids, int):
                    output.token_ids.clear()
                    output.token_ids.append(output_token_ids)
                else:
                    output.token_ids = output_token_ids

                output.cumulative_logprob = seq.get_cumulative_logprob() \
                    if include_logprobs else None
                output.logprobs = output_logprobs
                output.finish_reason = SequenceStatus.get_finished_reason(
                    seq.status)
                output.stop_reason = seq.stop_reason

            else:
                output = CompletionOutput(
                    top_n_seqs.index(seq), output_text, [output_token_ids]
                    if isinstance(output_token_ids, int) else output_token_ids,
                    seq.get_cumulative_logprob() if include_logprobs else None,
                    output_logprobs,
                    SequenceStatus.get_finished_reason(seq.status),
                    seq.stop_reason)

            outputs.append(output)

        # Every sequence in the sequence group should have the same prompt.
        if include_prompt:
            prompt = seq_group.prompt
            prompt_token_ids = seq_group.prompt_token_ids
            encoder_prompt = seq_group.encoder_prompt
            encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids
            prompt_logprobs = seq_group.prompt_logprobs
        else:
            prompt = None
            prompt_token_ids = None
            encoder_prompt = None
            encoder_prompt_token_ids = None
            prompt_logprobs = None
        finished_time = time.time() if finished else None
        seq_group.set_finished_time(finished_time)

        init_kwargs = {
            "request_id": seq_group.request_id,
            "prompt": prompt,
            "prompt_token_ids": prompt_token_ids,
            "prompt_logprobs": prompt_logprobs,
            "outputs": outputs,
            "finished": finished,
            "metrics": seq_group.metrics,
            "lora_request": seq_group.lora_request,
            "encoder_prompt": encoder_prompt,
            "encoder_prompt_token_ids": encoder_prompt_token_ids,
            "num_cached_tokens": num_cached_tokens,
            "multi_modal_placeholders": seq_group.multi_modal_placeholders
        }

        if use_cache:
            request_output = seq_group.cached_request_output
            request_output.__init__(**init_kwargs)  # type: ignore
        else:
            request_output = cls(**init_kwargs)  # type: ignore

        return request_output

    def __repr__(self) -> str:
        return (f"RequestOutput(request_id={self.request_id}, "
                f"prompt={self.prompt!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"encoder_prompt={self.encoder_prompt!r}, "
                f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
                f"prompt_logprobs={self.prompt_logprobs}, "
                f"outputs={self.outputs}, "
                f"finished={self.finished}, "
                f"metrics={self.metrics}, "
                f"lora_request={self.lora_request}, "
                f"num_cached_tokens={self.num_cached_tokens}, "
                f"multi_modal_placeholders={self.multi_modal_placeholders})")

encoder_prompt instance-attribute

encoder_prompt = encoder_prompt

encoder_prompt_token_ids instance-attribute

encoder_prompt_token_ids = encoder_prompt_token_ids

finished instance-attribute

finished = finished

kv_transfer_params instance-attribute

kv_transfer_params = kv_transfer_params

lora_request instance-attribute

lora_request = lora_request

metrics instance-attribute

metrics = metrics

multi_modal_placeholders instance-attribute

multi_modal_placeholders = multi_modal_placeholders or {}

num_cached_tokens instance-attribute

num_cached_tokens = num_cached_tokens

outputs instance-attribute

outputs = outputs

prompt instance-attribute

prompt = prompt

prompt_logprobs instance-attribute

prompt_logprobs = prompt_logprobs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    prompt: Optional[str],
    prompt_token_ids: Optional[list[int]],
    prompt_logprobs: Optional[PromptLogprobs],
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: Optional[RequestMetrics] = None,
    lora_request: Optional[LoRARequest] = None,
    encoder_prompt: Optional[str] = None,
    encoder_prompt_token_ids: Optional[list[int]] = None,
    num_cached_tokens: Optional[int] = None,
    *,
    multi_modal_placeholders: Optional[
        MultiModalPlaceholderDict
    ] = None,
    kv_transfer_params: Optional[dict[str, Any]] = None,
    **kwargs: Any,
) -> None
Source code in vllm/outputs.py
def __init__(
    self,
    request_id: str,
    prompt: Optional[str],
    prompt_token_ids: Optional[list[int]],
    prompt_logprobs: Optional[PromptLogprobs],
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: Optional[RequestMetrics] = None,
    lora_request: Optional[LoRARequest] = None,
    encoder_prompt: Optional[str] = None,
    encoder_prompt_token_ids: Optional[list[int]] = None,
    num_cached_tokens: Optional[int] = None,
    *,
    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
    kv_transfer_params: Optional[dict[str, Any]] = None,
    # Forward compatibility, code that uses args added in new release can
    # still run with older versions of vLLM without breaking.
    **kwargs: Any,
) -> None:
    if kwargs:
        logger.warning_once("RequestOutput: Ignoring extra arguments: %s",
                            str(kwargs))
    self.request_id = request_id
    self.prompt = prompt
    self.prompt_token_ids = prompt_token_ids
    self.multi_modal_placeholders = multi_modal_placeholders or {}
    self.prompt_logprobs = prompt_logprobs
    self.outputs = outputs
    self.finished = finished
    self.metrics = metrics
    self.lora_request = lora_request
    self.encoder_prompt = encoder_prompt
    self.encoder_prompt_token_ids = encoder_prompt_token_ids
    self.num_cached_tokens = num_cached_tokens
    self.kv_transfer_params = kv_transfer_params

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return (f"RequestOutput(request_id={self.request_id}, "
            f"prompt={self.prompt!r}, "
            f"prompt_token_ids={self.prompt_token_ids}, "
            f"encoder_prompt={self.encoder_prompt!r}, "
            f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}, "
            f"prompt_logprobs={self.prompt_logprobs}, "
            f"outputs={self.outputs}, "
            f"finished={self.finished}, "
            f"metrics={self.metrics}, "
            f"lora_request={self.lora_request}, "
            f"num_cached_tokens={self.num_cached_tokens}, "
            f"multi_modal_placeholders={self.multi_modal_placeholders})")

add

add(next_output: RequestOutput, aggregate: bool) -> None

Merge subsequent RequestOutput into this one

Source code in vllm/outputs.py
def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
    """Merge subsequent RequestOutput into this one"""

    self.finished |= next_output.finished
    self.kv_transfer_params = next_output.kv_transfer_params

    for next_completion in next_output.outputs:
        for i, completion in enumerate(self.outputs):
            if completion.index == next_completion.index:
                if aggregate:
                    # Merge outputs with same index
                    completion.text += next_completion.text
                    if not isinstance(completion.token_ids,
                                      MutableSequence):
                        completion.token_ids = list(completion.token_ids)
                    completion.token_ids.extend(next_completion.token_ids)
                    if next_completion.logprobs:
                        assert completion.logprobs is not None
                        completion.logprobs.extend(
                            next_completion.logprobs)
                    completion.cumulative_logprob = (
                        next_completion.cumulative_logprob)
                    completion.finish_reason = next_completion.finish_reason
                    completion.stop_reason = next_completion.stop_reason
                else:
                    # Replace the output with the new one
                    self.outputs[i] = next_completion
                break
        else:
            self.outputs.append(next_completion)

from_seq_group classmethod

from_seq_group(
    seq_group: SequenceGroup,
    use_cache: bool,
    seq_id_to_seq_group: dict[str, SequenceGroupBase],
) -> Optional[RequestOutput]
Source code in vllm/outputs.py
@classmethod
def from_seq_group(
    cls, seq_group: SequenceGroup, use_cache: bool,
    seq_id_to_seq_group: dict[str, SequenceGroupBase]
) -> Optional["RequestOutput"]:
    finished = seq_group.is_finished()

    if seq_group.request_id in seq_id_to_seq_group:
        group: SequenceGroupBase = seq_id_to_seq_group[
            seq_group.request_id]
        assembled_seq_group = group.maybe_assemble_group(seq_group)
        if finished:
            group.finish_seq(seq_group)
        if assembled_seq_group is None:
            return None

        # clear finished seq in seq_id_to_seq_group
        if len(group.to_be_finished) == 0:
            for sub_request_id in list(group.seq_id_to_index.keys()):
                if sub_request_id in seq_id_to_seq_group:
                    del seq_id_to_seq_group[sub_request_id]

        return cls.from_seq_group(assembled_seq_group, use_cache,
                                  seq_id_to_seq_group)

    sampling_params = seq_group.sampling_params
    if sampling_params is None:
        raise ValueError(
            "Sampling parameters are missing for a CompletionRequest.")

    if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and (
            not finished):
        return None

    # Init cache (if needed)
    if use_cache and seq_group.cached_request_output is None:
        seq_group.cached_request_output = RequestOutput(  # type: ignore
            request_id="",
            prompt=None,
            prompt_token_ids=[],
            prompt_logprobs=None,
            outputs=[],
            finished=False)

    top_n_seqs = seq_group.get_seqs()

    # Create the outputs.
    # NOTE: We need omit logprobs here explicitly because the sequence
    # always has the logprobs of the sampled tokens even if the
    # logprobs are not requested.
    include_logprobs = sampling_params.logprobs is not None
    text_buffer_length = sampling_params.output_text_buffer_length
    delta = sampling_params.output_kind == RequestOutputKind.DELTA

    outputs = []
    include_prompt = True
    # num_cached_tokens should be the same for all the sequences
    num_cached_tokens = None
    for i, seq in enumerate(top_n_seqs):
        output_text = seq.get_output_text_to_return(
            text_buffer_length, delta)

        output_token_ids = seq.get_output_token_ids_to_return(delta)
        num_output_tokens = 1 if isinstance(output_token_ids,
                                            int) else len(output_token_ids)
        num_cached_tokens = seq.data.get_num_cached_tokens()

        output_logprobs = seq.output_logprobs if include_logprobs else None

        if delta:
            # Slice logprobs delta if applicable
            if output_logprobs:
                # num_output_tokens can be 0 when n > 1 and request finishes
                # before the others
                if num_output_tokens > 0:
                    output_logprobs = output_logprobs[-num_output_tokens:]
                else:
                    output_logprobs = None
            # Don't include prompt if this is after the first output
            # containing decode token ids
            if include_prompt and seq.get_output_len() > num_output_tokens:
                include_prompt = False

        if use_cache:
            # Get cached output object
            cached_outputs = seq_group.cached_request_output.outputs  # type: ignore
            if i >= len(cached_outputs):
                cached_outputs.append(
                    CompletionOutput(index=i,
                                     text="",
                                     token_ids=[],
                                     cumulative_logprob=None,
                                     logprobs=None,
                                     finish_reason=None,
                                     stop_reason=None))
            output = cached_outputs[i]

            # Init cached output object
            assert output.index == i
            output.text = output_text

            if isinstance(output_token_ids, int):
                output.token_ids.clear()
                output.token_ids.append(output_token_ids)
            else:
                output.token_ids = output_token_ids

            output.cumulative_logprob = seq.get_cumulative_logprob() \
                if include_logprobs else None
            output.logprobs = output_logprobs
            output.finish_reason = SequenceStatus.get_finished_reason(
                seq.status)
            output.stop_reason = seq.stop_reason

        else:
            output = CompletionOutput(
                top_n_seqs.index(seq), output_text, [output_token_ids]
                if isinstance(output_token_ids, int) else output_token_ids,
                seq.get_cumulative_logprob() if include_logprobs else None,
                output_logprobs,
                SequenceStatus.get_finished_reason(seq.status),
                seq.stop_reason)

        outputs.append(output)

    # Every sequence in the sequence group should have the same prompt.
    if include_prompt:
        prompt = seq_group.prompt
        prompt_token_ids = seq_group.prompt_token_ids
        encoder_prompt = seq_group.encoder_prompt
        encoder_prompt_token_ids = seq_group.encoder_prompt_token_ids
        prompt_logprobs = seq_group.prompt_logprobs
    else:
        prompt = None
        prompt_token_ids = None
        encoder_prompt = None
        encoder_prompt_token_ids = None
        prompt_logprobs = None
    finished_time = time.time() if finished else None
    seq_group.set_finished_time(finished_time)

    init_kwargs = {
        "request_id": seq_group.request_id,
        "prompt": prompt,
        "prompt_token_ids": prompt_token_ids,
        "prompt_logprobs": prompt_logprobs,
        "outputs": outputs,
        "finished": finished,
        "metrics": seq_group.metrics,
        "lora_request": seq_group.lora_request,
        "encoder_prompt": encoder_prompt,
        "encoder_prompt_token_ids": encoder_prompt_token_ids,
        "num_cached_tokens": num_cached_tokens,
        "multi_modal_placeholders": seq_group.multi_modal_placeholders
    }

    if use_cache:
        request_output = seq_group.cached_request_output
        request_output.__init__(**init_kwargs)  # type: ignore
    else:
        request_output = cls(**init_kwargs)  # type: ignore

    return request_output

SamplingParams

Bases: Struct

Sampling parameters for text generation.

Overall, we follow the sampling parameters from the OpenAI text completion API (https://platform.openai.com/docs/api-reference/completions/create). In addition, we support beam search, which is not supported by OpenAI.
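
A short construction sketch using only fields documented in the class below; temperature=0.0 corresponds to greedy sampling, and stop strings are removed from the returned text.

from vllm import SamplingParams

greedy = SamplingParams(temperature=0.0, max_tokens=32)

creative = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    presence_penalty=0.5,
    stop=["\n\n"],
    seed=42,
)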

Source code in vllm/sampling_params.py
class SamplingParams(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        # required for @cached_property.
        dict=True):  # type: ignore[call-arg]
    """Sampling parameters for text generation.

    Overall, we follow the sampling parameters from the OpenAI text completion
    API (https://platform.openai.com/docs/api-reference/completions/create).
    In addition, we support beam search, which is not supported by OpenAI.
    """

    n: int = 1
    """Number of output sequences to return for the given prompt."""
    best_of: Optional[int] = None
    """Number of output sequences that are generated from the prompt. From
    these `best_of` sequences, the top `n` sequences are returned. `best_of`
    must be greater than or equal to `n`. By default, `best_of` is set to `n`.
    Warning, this is only supported in V0."""
    _real_n: Optional[int] = None
    presence_penalty: float = 0.0
    """Penalizes new tokens based on whether they appear in the generated text
    so far. Values > 0 encourage the model to use new tokens, while values < 0
    encourage the model to repeat tokens."""
    frequency_penalty: float = 0.0
    """Penalizes new tokens based on their frequency in the generated text so
    far. Values > 0 encourage the model to use new tokens, while values < 0
    encourage the model to repeat tokens."""
    repetition_penalty: float = 1.0
    """Penalizes new tokens based on whether they appear in the prompt and the
    generated text so far. Values > 1 encourage the model to use new tokens,
    while values < 1 encourage the model to repeat tokens."""
    temperature: float = 1.0
    """Controls the randomness of the sampling. Lower values make the model
    more deterministic, while higher values make the model more random. Zero
    means greedy sampling."""
    top_p: float = 1.0
    """Controls the cumulative probability of the top tokens to consider. Must
    be in (0, 1]. Set to 1 to consider all tokens."""
    top_k: int = 0
    """Controls the number of top tokens to consider. Set to 0 (or -1) to
    consider all tokens."""
    min_p: float = 0.0
    """Represents the minimum probability for a token to be considered,
    relative to the probability of the most likely token. Must be in [0, 1].
    Set to 0 to disable this."""
    seed: Optional[int] = None
    """Random seed to use for the generation."""
    stop: Optional[Union[str, list[str]]] = None
    """String(s) that stop the generation when they are generated. The returned
    output will not contain the stop strings."""
    stop_token_ids: Optional[list[int]] = None
    """Token IDs that stop the generation when they are generated. The returned
    output will contain the stop tokens unless the stop tokens are special
    tokens."""
    ignore_eos: bool = False
    """Whether to ignore the EOS token and continue generating
    tokens after the EOS token is generated."""
    max_tokens: Optional[int] = 16
    """Maximum number of tokens to generate per output sequence."""
    min_tokens: int = 0
    """Minimum number of tokens to generate per output sequence before EOS or
    `stop_token_ids` can be generated"""
    logprobs: Optional[int] = None
    """Number of log probabilities to return per output token. When set to
    `None`, no probability is returned. If set to a non-`None` value, the
    result includes the log probabilities of the specified number of most
    likely tokens, as well as the chosen tokens. Note that the implementation
    follows the OpenAI API: The API will always return the log probability of
    the sampled token, so there may be up to `logprobs+1` elements in the
    response. When set to -1, return all `vocab_size` log probabilities."""
    prompt_logprobs: Optional[int] = None
    """Number of log probabilities to return per prompt token."""
    # NOTE: This parameter is only exposed at the engine level for now.
    # It is not exposed in the OpenAI API server, as the OpenAI API does
    # not support returning only a list of token IDs.
    detokenize: bool = True
    """Whether to detokenize the output."""
    skip_special_tokens: bool = True
    """Whether to skip special tokens in the output."""
    spaces_between_special_tokens: bool = True
    """Whether to add spaces between special tokens in the output."""
    # Optional[list[LogitsProcessor]] type. We use Any here because
    # Optional[list[LogitsProcessor]] type is not supported by msgspec.
    logits_processors: Optional[Any] = None
    """Functions that modify logits based on previously generated tokens, and
    optionally prompt tokens as a first argument."""
    include_stop_str_in_output: bool = False
    """Whether to include the stop strings in output text."""
    truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None
    """If set to -1, will use the truncation size supported by the model. If
    set to an integer k, will use only the last k tokens from the prompt
    (i.e., left truncation). If set to `None`, truncation is disabled."""
    output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE

    # The below fields are not supposed to be used as an input.
    # They are set in post_init.
    output_text_buffer_length: int = 0
    _all_stop_token_ids: set[int] = msgspec.field(default_factory=set)

    # Fields used to construct logits processors
    guided_decoding: Optional[GuidedDecodingParams] = None
    """If provided, the engine will construct a guided decoding logits
    processor from these parameters."""
    logit_bias: Optional[dict[int, float]] = None
    """If provided, the engine will construct a logits processor that applies
    these logit biases."""
    allowed_token_ids: Optional[list[int]] = None
    """If provided, the engine will construct a logits processor which only
    retains scores for the given token ids."""
    extra_args: Optional[dict[str, Any]] = None
    """Arbitrary additional args, that can be used by custom sampling
    implementations, plugins, etc. Not used by any in-tree sampling
    implementations."""

    # Fields used for bad words
    bad_words: Optional[list[str]] = None
    """Words that are not allowed to be generated. More precisely, only the
    last token of a corresponding token sequence is not allowed when the next
    generated token can complete the sequence."""
    _bad_words_token_ids: Optional[list[list[int]]] = None

    @staticmethod
    def from_optional(
        n: Optional[int] = 1,
        best_of: Optional[int] = None,
        presence_penalty: Optional[float] = 0.0,
        frequency_penalty: Optional[float] = 0.0,
        repetition_penalty: Optional[float] = 1.0,
        temperature: Optional[float] = 1.0,
        top_p: Optional[float] = 1.0,
        top_k: int = 0,
        min_p: float = 0.0,
        seed: Optional[int] = None,
        stop: Optional[Union[str, list[str]]] = None,
        stop_token_ids: Optional[list[int]] = None,
        bad_words: Optional[list[str]] = None,
        include_stop_str_in_output: bool = False,
        ignore_eos: bool = False,
        max_tokens: Optional[int] = 16,
        min_tokens: int = 0,
        logprobs: Optional[int] = None,
        prompt_logprobs: Optional[int] = None,
        detokenize: bool = True,
        skip_special_tokens: bool = True,
        spaces_between_special_tokens: bool = True,
        logits_processors: Optional[list[LogitsProcessor]] = None,
        truncate_prompt_tokens: Optional[Annotated[int,
                                                   msgspec.Meta(ge=1)]] = None,
        output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
        guided_decoding: Optional[GuidedDecodingParams] = None,
        logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
        allowed_token_ids: Optional[list[int]] = None,
        extra_args: Optional[dict[str, Any]] = None,
    ) -> "SamplingParams":
        if logit_bias is not None:
            # Convert token_id to integer
            # Clamp the bias between -100 and 100 per OpenAI API spec
            logit_bias = {
                int(token): min(100.0, max(-100.0, bias))
                for token, bias in logit_bias.items()
            }

        return SamplingParams(
            n=1 if n is None else n,
            best_of=best_of,
            presence_penalty=0.0
            if presence_penalty is None else presence_penalty,
            frequency_penalty=0.0
            if frequency_penalty is None else frequency_penalty,
            repetition_penalty=1.0
            if repetition_penalty is None else repetition_penalty,
            temperature=1.0 if temperature is None else temperature,
            top_p=1.0 if top_p is None else top_p,
            top_k=top_k,
            min_p=min_p,
            seed=seed,
            stop=stop,
            stop_token_ids=stop_token_ids,
            bad_words=bad_words,
            include_stop_str_in_output=include_stop_str_in_output,
            ignore_eos=ignore_eos,
            max_tokens=max_tokens,
            min_tokens=min_tokens,
            logprobs=logprobs,
            prompt_logprobs=prompt_logprobs,
            detokenize=detokenize,
            skip_special_tokens=skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
            logits_processors=logits_processors,
            truncate_prompt_tokens=truncate_prompt_tokens,
            output_kind=output_kind,
            guided_decoding=guided_decoding,
            logit_bias=logit_bias,
            allowed_token_ids=allowed_token_ids,
            extra_args=extra_args,
        )

    def __post_init__(self) -> None:
        # how we deal with `best_of``:
        # if `best_of`` is not set, we default to `n`;
        # if `best_of`` is set, we set `n`` to `best_of`,
        # and set `_real_n`` to the original `n`.
        # when we return the result, we will check
        # if we need to return `n` or `_real_n` results
        if self.best_of:
            if self.best_of < self.n:
                raise ValueError(
                    f"best_of must be greater than or equal to n, "
                    f"got n={self.n} and best_of={self.best_of}.")
            if not self._real_n:
                self._real_n = self.n
                self.n = self.best_of

        if 0 < self.temperature < _MAX_TEMP:
            logger.warning(
                "temperature %s is less than %s, which may cause numerical "
                "errors nan or inf in tensors. We have maxed it out to %s.",
                self.temperature, _MAX_TEMP, _MAX_TEMP)
            self.temperature = max(self.temperature, _MAX_TEMP)

        if self.seed == -1:
            self.seed = None

        if self.stop is None:
            self.stop = []
        elif isinstance(self.stop, str):
            self.stop = [self.stop]

        if self.stop_token_ids is None:
            self.stop_token_ids = []

        if self.bad_words is None:
            self.bad_words = []

        if self.logprobs is True:
            self.logprobs = 1

        if self.prompt_logprobs is True:
            self.prompt_logprobs = 1

        # Number of characters to hold back for stop string evaluation
        # until sequence is finished.
        if self.stop and not self.include_stop_str_in_output:
            self.output_text_buffer_length = max(len(s) for s in self.stop) - 1

        self._verify_args()

        if self.temperature < _SAMPLING_EPS:
            # Zero temperature means greedy sampling.
            self.top_p = 1.0
            self.top_k = 0
            self.min_p = 0.0
            self._verify_greedy_sampling()

        # eos_token_id is added to this by the engine
        self._all_stop_token_ids.update(self.stop_token_ids)

    def _verify_args(self) -> None:
        if not isinstance(self.n, int):
            raise ValueError(f"n must be an int, but is of "
                             f"type {type(self.n)}")
        if self.n < 1:
            raise ValueError(f"n must be at least 1, got {self.n}.")
        if self.best_of is not None:
            if not isinstance(self.best_of, int):
                raise ValueError(
                    f"best_of must be an integer, got {type(self.best_of)}")
            if self.best_of < 1:
                raise ValueError(
                    f"best_of must be at least 1, got {self.best_of}")
            if self.best_of < self.n:
                raise ValueError(
                    f"best_of must be greater than or equal to n, "
                    f"got n={self.n} and best_of={self.best_of}.")
        if not -2.0 <= self.presence_penalty <= 2.0:
            raise ValueError("presence_penalty must be in [-2, 2], got "
                             f"{self.presence_penalty}.")
        if not -2.0 <= self.frequency_penalty <= 2.0:
            raise ValueError("frequency_penalty must be in [-2, 2], got "
                             f"{self.frequency_penalty}.")
        if self.repetition_penalty <= 0.0:
            raise ValueError(
                "repetition_penalty must be greater than zero, got "
                f"{self.repetition_penalty}.")
        if self.temperature < 0.0:
            raise ValueError(
                f"temperature must be non-negative, got {self.temperature}.")
        if not 0.0 < self.top_p <= 1.0:
            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
        # quietly accept -1 as disabled, but prefer 0
        if self.top_k < -1:
            raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                             f"got {self.top_k}.")
        if not isinstance(self.top_k, int):
            raise TypeError(
                f"top_k must be an integer, got {type(self.top_k).__name__}")
        if not 0.0 <= self.min_p <= 1.0:
            raise ValueError("min_p must be in [0, 1], got "
                             f"{self.min_p}.")
        if self.max_tokens is not None and self.max_tokens < 1:
            raise ValueError(
                f"max_tokens must be at least 1, got {self.max_tokens}.")
        if self.min_tokens < 0:
            raise ValueError(f"min_tokens must be greater than or equal to 0, "
                             f"got {self.min_tokens}.")
        if self.max_tokens is not None and self.min_tokens > self.max_tokens:
            raise ValueError(
                f"min_tokens must be less than or equal to "
                f"max_tokens={self.max_tokens}, got {self.min_tokens}.")
        if (self.logprobs is not None and self.logprobs != -1
                and self.logprobs < 0):
            raise ValueError(
                f"logprobs must be non-negative or -1, got {self.logprobs}.")
        if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
            raise ValueError(f"prompt_logprobs must be non-negative, got "
                             f"{self.prompt_logprobs}.")
        if (self.truncate_prompt_tokens is not None
                and self.truncate_prompt_tokens < 1):
            raise ValueError(f"truncate_prompt_tokens must be >= 1, "
                             f"got {self.truncate_prompt_tokens}")
        assert isinstance(self.stop_token_ids, list)
        if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
            raise ValueError(f"stop_token_ids must contain only integers, "
                             f"got {self.stop_token_ids}.")
        assert isinstance(self.stop, list)
        if any(not stop_str for stop_str in self.stop):
            raise ValueError("stop cannot contain an empty string.")
        if self.stop and not self.detokenize:
            raise ValueError(
                "stop strings are only supported when detokenize is True. "
                "Set detokenize=True to use stop.")
        if self.best_of != self._real_n and self.output_kind == (
                RequestOutputKind.DELTA):
            raise ValueError("best_of must equal n to use output_kind=DELTA")

    def _verify_greedy_sampling(self) -> None:
        if self.n > 1:
            raise ValueError("n must be 1 when using greedy sampling, "
                             f"got {self.n}.")

    def update_from_generation_config(
            self,
            generation_config: dict[str, Any],
            model_eos_token_id: Optional[int] = None) -> None:
        """Update if there are non-default values from generation_config"""

        if model_eos_token_id is not None:
            # Add the eos token id into the sampling_params to support
            # min_tokens processing.
            self._all_stop_token_ids.add(model_eos_token_id)

        # Update eos_token_id for generation
        if (eos_ids := generation_config.get("eos_token_id")) is not None:
            # it can be either int or list of int
            eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
            if model_eos_token_id is not None:
                # We don't need to include the primary eos_token_id in
                # stop_token_ids since it's handled separately for stopping
                # purposes.
                eos_ids.discard(model_eos_token_id)
            if eos_ids:
                self._all_stop_token_ids.update(eos_ids)
                if not self.ignore_eos:
                    eos_ids.update(self.stop_token_ids)
                    self.stop_token_ids = list(eos_ids)

    def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None:
        if not self.bad_words:
            return
        self._bad_words_token_ids = []
        for bad_word in self.bad_words:
            # To prohibit words both at the beginning
            # and in the middle of text
            # (related to add_prefix_space tokenizer parameter)
            for add_prefix_space in [False, True]:
                prefix = " " if add_prefix_space else ""
                prompt = prefix + bad_word.lstrip()
                prompt_token_ids = tokenizer.encode(text=prompt,
                                                    add_special_tokens=False)

                # If no space at the beginning
                # or if prefix space produces a new word token
                if (not add_prefix_space) or (
                        add_prefix_space and prompt_token_ids[0]
                        != self._bad_words_token_ids[-1][0]
                        and len(prompt_token_ids) == len(
                            self._bad_words_token_ids[-1])):
                    self._bad_words_token_ids.append(prompt_token_ids)

        invalid_token_ids = [
            token_id for bad_words_token_ids in self._bad_words_token_ids
            for token_id in bad_words_token_ids
            if token_id < 0 or token_id > tokenizer.max_token_id
        ]
        if len(invalid_token_ids) > 0:
            raise ValueError(
                f"The model vocabulary size is {tokenizer.max_token_id+1},"
                f" but the following tokens"
                f" were specified as bad: {invalid_token_ids}."
                f" All token id values should be integers satisfying:"
                f" 0 <= token_id <= {tokenizer.max_token_id}.")

    @cached_property
    def sampling_type(self) -> SamplingType:
        if self.temperature < _SAMPLING_EPS:
            return SamplingType.GREEDY
        if self.seed is not None:
            return SamplingType.RANDOM_SEED
        return SamplingType.RANDOM

    @property
    def all_stop_token_ids(self) -> set[int]:
        return self._all_stop_token_ids

    @property
    def bad_words_token_ids(self) -> Optional[list[list[int]]]:
        # For internal use only. Backward compatibility not guaranteed
        return self._bad_words_token_ids

    def clone(self) -> "SamplingParams":
        """Deep copy, but maybe not the LogitsProcessor objects.

        LogitsProcessor objects may contain an arbitrary, nontrivial amount of
        data that is expensive to copy. However, if not copied, the processor
        needs to support parallel decoding for multiple sequences
        See https://github.com/vllm-project/vllm/issues/3087
        """

        logit_processor_refs = None if self.logits_processors is None else {
            id(lp): lp.clone() if hasattr(lp, 'clone') else lp
            for lp in self.logits_processors
        }
        return copy.deepcopy(self, memo=logit_processor_refs)

    def __repr__(self) -> str:
        return (
            f"SamplingParams(n={self.n}, "
            f"presence_penalty={self.presence_penalty}, "
            f"frequency_penalty={self.frequency_penalty}, "
            f"repetition_penalty={self.repetition_penalty}, "
            f"temperature={self.temperature}, "
            f"top_p={self.top_p}, "
            f"top_k={self.top_k}, "
            f"min_p={self.min_p}, "
            f"seed={self.seed}, "
            f"stop={self.stop}, "
            f"stop_token_ids={self.stop_token_ids}, "
            f"bad_words={self.bad_words}, "
            f"include_stop_str_in_output={self.include_stop_str_in_output}, "
            f"ignore_eos={self.ignore_eos}, "
            f"max_tokens={self.max_tokens}, "
            f"min_tokens={self.min_tokens}, "
            f"logprobs={self.logprobs}, "
            f"prompt_logprobs={self.prompt_logprobs}, "
            f"skip_special_tokens={self.skip_special_tokens}, "
            "spaces_between_special_tokens="
            f"{self.spaces_between_special_tokens}, "
            f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
            f"guided_decoding={self.guided_decoding}, "
            f"extra_args={self.extra_args})")

_all_stop_token_ids class-attribute instance-attribute

_all_stop_token_ids: set[int] = field(default_factory=set)

_bad_words_token_ids class-attribute instance-attribute

_bad_words_token_ids: Optional[list[list[int]]] = None

_real_n class-attribute instance-attribute

_real_n: Optional[int] = None

all_stop_token_ids property

all_stop_token_ids: set[int]

allowed_token_ids class-attribute instance-attribute

allowed_token_ids: Optional[list[int]] = None

If provided, the engine will construct a logits processor which only retains scores for the given token ids.

bad_words class-attribute instance-attribute

bad_words: Optional[list[str]] = None

Words that are not allowed to be generated. More precisely, only the last token of a corresponding token sequence is not allowed when the next generated token can complete the sequence.

bad_words_token_ids property

bad_words_token_ids: Optional[list[list[int]]]

best_of class-attribute instance-attribute

best_of: Optional[int] = None

Number of output sequences that are generated from the prompt. From these best_of sequences, the top n sequences are returned. best_of must be greater than or equal to n. By default, best_of is set to n. Warning, this is only supported in V0.

detokenize class-attribute instance-attribute

detokenize: bool = True

Whether to detokenize the output.

extra_args class-attribute instance-attribute

extra_args: Optional[dict[str, Any]] = None

Arbitrary additional args that can be used by custom sampling implementations, plugins, etc. Not used by any in-tree sampling implementations.

frequency_penalty class-attribute instance-attribute

frequency_penalty: float = 0.0

Penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.

guided_decoding class-attribute instance-attribute

guided_decoding: Optional[GuidedDecodingParams] = None

If provided, the engine will construct a guided decoding logits processor from these parameters.
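
For instance, a hedged sketch that constrains the output to a fixed set of labels (it assumes the GuidedDecodingParams.choice field available in recent vLLM releases):

from vllm import SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Restrict generation to one of two labels via guided decoding.
params = SamplingParams(
    temperature=0.0,
    guided_decoding=GuidedDecodingParams(choice=["positive", "negative"]),
)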

ignore_eos class-attribute instance-attribute

ignore_eos: bool = False

Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.

include_stop_str_in_output class-attribute instance-attribute

include_stop_str_in_output: bool = False

Whether to include the stop strings in output text.

logit_bias class-attribute instance-attribute

logit_bias: Optional[dict[int, float]] = None

If provided, the engine will construct a logits processor that applies these logit biases.
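
For example (the token ids below are illustrative; they depend on the tokenizer):

from vllm import SamplingParams

# Biases are added to the raw logits of the given token ids before sampling.
params = SamplingParams(logit_bias={50256: -100.0, 11: 2.0})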

logits_processors class-attribute instance-attribute

logits_processors: Optional[Any] = None

Functions that modify logits based on previously generated tokens, and optionally prompt tokens as a first argument.

logprobs class-attribute instance-attribute

logprobs: Optional[int] = None

Number of log probabilities to return per output token. When set to None, no probability is returned. If set to a non-None value, the result includes the log probabilities of the specified number of most likely tokens, as well as the chosen tokens. Note that the implementation follows the OpenAI API: The API will always return the log probability of the sampled token, so there may be up to logprobs+1 elements in the response. When set to -1, return all vocab_size log probabilities.
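
For example:

from vllm import SamplingParams

# Return the log probabilities of the 5 most likely tokens at each step;
# each CompletionOutput then carries one logprobs entry per generated token.
params = SamplingParams(logprobs=5, max_tokens=8)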

max_tokens class-attribute instance-attribute

max_tokens: Optional[int] = 16

Maximum number of tokens to generate per output sequence.

min_p class-attribute instance-attribute

min_p: float = 0.0

Represents the minimum probability for a token to be considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this.

min_tokens class-attribute instance-attribute

min_tokens: int = 0

Minimum number of tokens to generate per output sequence before EOS or stop_token_ids can be generated

n class-attribute instance-attribute

n: int = 1

Number of output sequences to return for the given prompt.

output_kind class-attribute instance-attribute

output_kind: RequestOutputKind = CUMULATIVE

output_text_buffer_length class-attribute instance-attribute

output_text_buffer_length: int = 0

presence_penalty class-attribute instance-attribute

presence_penalty: float = 0.0

Penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens.

prompt_logprobs class-attribute instance-attribute

prompt_logprobs: Optional[int] = None

Number of log probabilities to return per prompt token.

repetition_penalty class-attribute instance-attribute

repetition_penalty: float = 1.0

Penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1 encourage the model to use new tokens, while values < 1 encourage the model to repeat tokens.

sampling_type cached property

sampling_type: SamplingType

seed class-attribute instance-attribute

seed: Optional[int] = None

Random seed to use for the generation.

skip_special_tokens class-attribute instance-attribute

skip_special_tokens: bool = True

Whether to skip special tokens in the output.

spaces_between_special_tokens class-attribute instance-attribute

spaces_between_special_tokens: bool = True

Whether to add spaces between special tokens in the output.

stop class-attribute instance-attribute

stop: Optional[Union[str, list[str]]] = None

String(s) that stop the generation when they are generated. The returned output will not contain the stop strings.

stop_token_ids class-attribute instance-attribute

stop_token_ids: Optional[list[int]] = None

Token IDs that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens.

temperature class-attribute instance-attribute

temperature: float = 1.0

Controls the randomness of the sampling. Lower values make the model more deterministic, while higher values make the model more random. Zero means greedy sampling.

top_k class-attribute instance-attribute

top_k: int = 0

Controls the number of top tokens to consider. Set to 0 (or -1) to consider all tokens.

top_p class-attribute instance-attribute

top_p: float = 1.0

Controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens.

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: Optional[
    Annotated[int, Meta(ge=1)]
] = None

If set to -1, will use the truncation size supported by the model. If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). If set to None, truncation is disabled.

__post_init__

__post_init__() -> None
Source code in vllm/sampling_params.py
def __post_init__(self) -> None:
    # how we deal with `best_of``:
    # if `best_of`` is not set, we default to `n`;
    # if `best_of`` is set, we set `n`` to `best_of`,
    # and set `_real_n`` to the original `n`.
    # when we return the result, we will check
    # if we need to return `n` or `_real_n` results
    if self.best_of:
        if self.best_of < self.n:
            raise ValueError(
                f"best_of must be greater than or equal to n, "
                f"got n={self.n} and best_of={self.best_of}.")
        if not self._real_n:
            self._real_n = self.n
            self.n = self.best_of

    if 0 < self.temperature < _MAX_TEMP:
        logger.warning(
            "temperature %s is less than %s, which may cause numerical "
            "errors nan or inf in tensors. We have maxed it out to %s.",
            self.temperature, _MAX_TEMP, _MAX_TEMP)
        self.temperature = max(self.temperature, _MAX_TEMP)

    if self.seed == -1:
        self.seed = None

    if self.stop is None:
        self.stop = []
    elif isinstance(self.stop, str):
        self.stop = [self.stop]

    if self.stop_token_ids is None:
        self.stop_token_ids = []

    if self.bad_words is None:
        self.bad_words = []

    if self.logprobs is True:
        self.logprobs = 1

    if self.prompt_logprobs is True:
        self.prompt_logprobs = 1

    # Number of characters to hold back for stop string evaluation
    # until sequence is finished.
    if self.stop and not self.include_stop_str_in_output:
        self.output_text_buffer_length = max(len(s) for s in self.stop) - 1

    self._verify_args()

    if self.temperature < _SAMPLING_EPS:
        # Zero temperature means greedy sampling.
        self.top_p = 1.0
        self.top_k = 0
        self.min_p = 0.0
        self._verify_greedy_sampling()

    # eos_token_id is added to this by the engine
    self._all_stop_token_ids.update(self.stop_token_ids)
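
The best_of handling above can be observed directly in a small sketch:

from vllm import SamplingParams

params = SamplingParams(n=2, best_of=4)
# __post_init__ swaps the values: `n` becomes `best_of`, and the original `n`
# is stored internally so only the top 2 of the 4 sequences are returned.
assert params.n == 4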

__repr__

__repr__() -> str
Source code in vllm/sampling_params.py
def __repr__(self) -> str:
    return (
        f"SamplingParams(n={self.n}, "
        f"presence_penalty={self.presence_penalty}, "
        f"frequency_penalty={self.frequency_penalty}, "
        f"repetition_penalty={self.repetition_penalty}, "
        f"temperature={self.temperature}, "
        f"top_p={self.top_p}, "
        f"top_k={self.top_k}, "
        f"min_p={self.min_p}, "
        f"seed={self.seed}, "
        f"stop={self.stop}, "
        f"stop_token_ids={self.stop_token_ids}, "
        f"bad_words={self.bad_words}, "
        f"include_stop_str_in_output={self.include_stop_str_in_output}, "
        f"ignore_eos={self.ignore_eos}, "
        f"max_tokens={self.max_tokens}, "
        f"min_tokens={self.min_tokens}, "
        f"logprobs={self.logprobs}, "
        f"prompt_logprobs={self.prompt_logprobs}, "
        f"skip_special_tokens={self.skip_special_tokens}, "
        "spaces_between_special_tokens="
        f"{self.spaces_between_special_tokens}, "
        f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
        f"guided_decoding={self.guided_decoding}, "
        f"extra_args={self.extra_args})")

_verify_args

_verify_args() -> None
Source code in vllm/sampling_params.py
def _verify_args(self) -> None:
    if not isinstance(self.n, int):
        raise ValueError(f"n must be an int, but is of "
                         f"type {type(self.n)}")
    if self.n < 1:
        raise ValueError(f"n must be at least 1, got {self.n}.")
    if self.best_of is not None:
        if not isinstance(self.best_of, int):
            raise ValueError(
                f"best_of must be an integer, got {type(self.best_of)}")
        if self.best_of < 1:
            raise ValueError(
                f"best_of must be at least 1, got {self.best_of}")
        if self.best_of < self.n:
            raise ValueError(
                f"best_of must be greater than or equal to n, "
                f"got n={self.n} and best_of={self.best_of}.")
    if not -2.0 <= self.presence_penalty <= 2.0:
        raise ValueError("presence_penalty must be in [-2, 2], got "
                         f"{self.presence_penalty}.")
    if not -2.0 <= self.frequency_penalty <= 2.0:
        raise ValueError("frequency_penalty must be in [-2, 2], got "
                         f"{self.frequency_penalty}.")
    if self.repetition_penalty <= 0.0:
        raise ValueError(
            "repetition_penalty must be greater than zero, got "
            f"{self.repetition_penalty}.")
    if self.temperature < 0.0:
        raise ValueError(
            f"temperature must be non-negative, got {self.temperature}.")
    if not 0.0 < self.top_p <= 1.0:
        raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
    # quietly accept -1 as disabled, but prefer 0
    if self.top_k < -1:
        raise ValueError(f"top_k must be 0 (disable), or at least 1, "
                         f"got {self.top_k}.")
    if not isinstance(self.top_k, int):
        raise TypeError(
            f"top_k must be an integer, got {type(self.top_k).__name__}")
    if not 0.0 <= self.min_p <= 1.0:
        raise ValueError("min_p must be in [0, 1], got "
                         f"{self.min_p}.")
    if self.max_tokens is not None and self.max_tokens < 1:
        raise ValueError(
            f"max_tokens must be at least 1, got {self.max_tokens}.")
    if self.min_tokens < 0:
        raise ValueError(f"min_tokens must be greater than or equal to 0, "
                         f"got {self.min_tokens}.")
    if self.max_tokens is not None and self.min_tokens > self.max_tokens:
        raise ValueError(
            f"min_tokens must be less than or equal to "
            f"max_tokens={self.max_tokens}, got {self.min_tokens}.")
    if (self.logprobs is not None and self.logprobs != -1
            and self.logprobs < 0):
        raise ValueError(
            f"logprobs must be non-negative or -1, got {self.logprobs}.")
    if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
        raise ValueError(f"prompt_logprobs must be non-negative, got "
                         f"{self.prompt_logprobs}.")
    if (self.truncate_prompt_tokens is not None
            and self.truncate_prompt_tokens < 1):
        raise ValueError(f"truncate_prompt_tokens must be >= 1, "
                         f"got {self.truncate_prompt_tokens}")
    assert isinstance(self.stop_token_ids, list)
    if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
        raise ValueError(f"stop_token_ids must contain only integers, "
                         f"got {self.stop_token_ids}.")
    assert isinstance(self.stop, list)
    if any(not stop_str for stop_str in self.stop):
        raise ValueError("stop cannot contain an empty string.")
    if self.stop and not self.detokenize:
        raise ValueError(
            "stop strings are only supported when detokenize is True. "
            "Set detokenize=True to use stop.")
    if self.best_of != self._real_n and self.output_kind == (
            RequestOutputKind.DELTA):
        raise ValueError("best_of must equal n to use output_kind=DELTA")

_verify_greedy_sampling

_verify_greedy_sampling() -> None
Source code in vllm/sampling_params.py
def _verify_greedy_sampling(self) -> None:
    if self.n > 1:
        raise ValueError("n must be 1 when using greedy sampling, "
                         f"got {self.n}.")

clone

clone() -> SamplingParams

Deep copy, but maybe not the LogitsProcessor objects.

LogitsProcessor objects may contain an arbitrary, nontrivial amount of data that is expensive to copy. However, if not copied, the processor needs to support parallel decoding for multiple sequences See https://github.com/vllm-project/vllm/issues/3087

Source code in vllm/sampling_params.py
def clone(self) -> "SamplingParams":
    """Deep copy, but maybe not the LogitsProcessor objects.

    LogitsProcessor objects may contain an arbitrary, nontrivial amount of
    data that is expensive to copy. However, if not copied, the processor
    needs to support parallel decoding for multiple sequences
    See https://github.com/vllm-project/vllm/issues/3087
    """

    logit_processor_refs = None if self.logits_processors is None else {
        id(lp): lp.clone() if hasattr(lp, 'clone') else lp
        for lp in self.logits_processors
    }
    return copy.deepcopy(self, memo=logit_processor_refs)
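
A brief usage sketch:

from vllm import SamplingParams

params = SamplingParams(temperature=0.7, max_tokens=32)
params_copy = params.clone()
# The copy is independent of the original; logits processors, if any, are the
# exception: they are reused or cloned via their own `clone()` method.
assert params_copy is not params
assert params_copy.temperature == 0.7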

from_optional staticmethod

from_optional(
    n: Optional[int] = 1,
    best_of: Optional[int] = None,
    presence_penalty: Optional[float] = 0.0,
    frequency_penalty: Optional[float] = 0.0,
    repetition_penalty: Optional[float] = 1.0,
    temperature: Optional[float] = 1.0,
    top_p: Optional[float] = 1.0,
    top_k: int = 0,
    min_p: float = 0.0,
    seed: Optional[int] = None,
    stop: Optional[Union[str, list[str]]] = None,
    stop_token_ids: Optional[list[int]] = None,
    bad_words: Optional[list[str]] = None,
    include_stop_str_in_output: bool = False,
    ignore_eos: bool = False,
    max_tokens: Optional[int] = 16,
    min_tokens: int = 0,
    logprobs: Optional[int] = None,
    prompt_logprobs: Optional[int] = None,
    detokenize: bool = True,
    skip_special_tokens: bool = True,
    spaces_between_special_tokens: bool = True,
    logits_processors: Optional[
        list[LogitsProcessor]
    ] = None,
    truncate_prompt_tokens: Optional[
        Annotated[int, Meta(ge=1)]
    ] = None,
    output_kind: RequestOutputKind = CUMULATIVE,
    guided_decoding: Optional[GuidedDecodingParams] = None,
    logit_bias: Optional[
        Union[dict[int, float], dict[str, float]]
    ] = None,
    allowed_token_ids: Optional[list[int]] = None,
    extra_args: Optional[dict[str, Any]] = None,
) -> SamplingParams
Source code in vllm/sampling_params.py
@staticmethod
def from_optional(
    n: Optional[int] = 1,
    best_of: Optional[int] = None,
    presence_penalty: Optional[float] = 0.0,
    frequency_penalty: Optional[float] = 0.0,
    repetition_penalty: Optional[float] = 1.0,
    temperature: Optional[float] = 1.0,
    top_p: Optional[float] = 1.0,
    top_k: int = 0,
    min_p: float = 0.0,
    seed: Optional[int] = None,
    stop: Optional[Union[str, list[str]]] = None,
    stop_token_ids: Optional[list[int]] = None,
    bad_words: Optional[list[str]] = None,
    include_stop_str_in_output: bool = False,
    ignore_eos: bool = False,
    max_tokens: Optional[int] = 16,
    min_tokens: int = 0,
    logprobs: Optional[int] = None,
    prompt_logprobs: Optional[int] = None,
    detokenize: bool = True,
    skip_special_tokens: bool = True,
    spaces_between_special_tokens: bool = True,
    logits_processors: Optional[list[LogitsProcessor]] = None,
    truncate_prompt_tokens: Optional[Annotated[int,
                                               msgspec.Meta(ge=1)]] = None,
    output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
    guided_decoding: Optional[GuidedDecodingParams] = None,
    logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
    allowed_token_ids: Optional[list[int]] = None,
    extra_args: Optional[dict[str, Any]] = None,
) -> "SamplingParams":
    if logit_bias is not None:
        # Convert token_id to integer
        # Clamp the bias between -100 and 100 per OpenAI API spec
        logit_bias = {
            int(token): min(100.0, max(-100.0, bias))
            for token, bias in logit_bias.items()
        }

    return SamplingParams(
        n=1 if n is None else n,
        best_of=best_of,
        presence_penalty=0.0
        if presence_penalty is None else presence_penalty,
        frequency_penalty=0.0
        if frequency_penalty is None else frequency_penalty,
        repetition_penalty=1.0
        if repetition_penalty is None else repetition_penalty,
        temperature=1.0 if temperature is None else temperature,
        top_p=1.0 if top_p is None else top_p,
        top_k=top_k,
        min_p=min_p,
        seed=seed,
        stop=stop,
        stop_token_ids=stop_token_ids,
        bad_words=bad_words,
        include_stop_str_in_output=include_stop_str_in_output,
        ignore_eos=ignore_eos,
        max_tokens=max_tokens,
        min_tokens=min_tokens,
        logprobs=logprobs,
        prompt_logprobs=prompt_logprobs,
        detokenize=detokenize,
        skip_special_tokens=skip_special_tokens,
        spaces_between_special_tokens=spaces_between_special_tokens,
        logits_processors=logits_processors,
        truncate_prompt_tokens=truncate_prompt_tokens,
        output_kind=output_kind,
        guided_decoding=guided_decoding,
        logit_bias=logit_bias,
        allowed_token_ids=allowed_token_ids,
        extra_args=extra_args,
    )
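
A short sketch of why this helper exists: it maps None back to the documented defaults, which is convenient when forwarding possibly-missing request fields (e.g. from an HTTP payload).

from vllm import SamplingParams

params = SamplingParams.from_optional(temperature=None, top_p=None, max_tokens=32)
assert params.temperature == 1.0
assert params.top_p == 1.0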

update_from_generation_config

update_from_generation_config(
    generation_config: dict[str, Any],
    model_eos_token_id: Optional[int] = None,
) -> None

Update if there are non-default values from generation_config

Source code in vllm/sampling_params.py
def update_from_generation_config(
        self,
        generation_config: dict[str, Any],
        model_eos_token_id: Optional[int] = None) -> None:
    """Update if there are non-default values from generation_config"""

    if model_eos_token_id is not None:
        # Add the eos token id into the sampling_params to support
        # min_tokens processing.
        self._all_stop_token_ids.add(model_eos_token_id)

    # Update eos_token_id for generation
    if (eos_ids := generation_config.get("eos_token_id")) is not None:
        # it can be either int or list of int
        eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
        if model_eos_token_id is not None:
            # We don't need to include the primary eos_token_id in
            # stop_token_ids since it's handled separately for stopping
            # purposes.
            eos_ids.discard(model_eos_token_id)
        if eos_ids:
            self._all_stop_token_ids.update(eos_ids)
            if not self.ignore_eos:
                eos_ids.update(self.stop_token_ids)
                self.stop_token_ids = list(eos_ids)
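
For example, merging extra EOS ids from a Hugging Face style generation_config dict:

from vllm import SamplingParams

params = SamplingParams()
params.update_from_generation_config({"eos_token_id": [2, 32000]},
                                     model_eos_token_id=2)
# The secondary EOS id becomes a stop token; the primary one is tracked separately.
assert 32000 in params.stop_token_ids
assert 2 not in params.stop_token_ids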

update_from_tokenizer

update_from_tokenizer(tokenizer: AnyTokenizer) -> None
Source code in vllm/sampling_params.py
def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None:
    if not self.bad_words:
        return
    self._bad_words_token_ids = []
    for bad_word in self.bad_words:
        # To prohibit words both at the beginning
        # and in the middle of text
        # (related to add_prefix_space tokenizer parameter)
        for add_prefix_space in [False, True]:
            prefix = " " if add_prefix_space else ""
            prompt = prefix + bad_word.lstrip()
            prompt_token_ids = tokenizer.encode(text=prompt,
                                                add_special_tokens=False)

            # If no space at the beginning
            # or if prefix space produces a new word token
            if (not add_prefix_space) or (
                    add_prefix_space and prompt_token_ids[0]
                    != self._bad_words_token_ids[-1][0]
                    and len(prompt_token_ids) == len(
                        self._bad_words_token_ids[-1])):
                self._bad_words_token_ids.append(prompt_token_ids)

    invalid_token_ids = [
        token_id for bad_words_token_ids in self._bad_words_token_ids
        for token_id in bad_words_token_ids
        if token_id < 0 or token_id > tokenizer.max_token_id
    ]
    if len(invalid_token_ids) > 0:
        raise ValueError(
            f"The model vocabulary size is {tokenizer.max_token_id+1},"
            f" but the following tokens"
            f" were specified as bad: {invalid_token_ids}."
            f" All token id values should be integers satisfying:"
            f" 0 <= token_id <= {tokenizer.max_token_id}.")

ScoringOutput dataclass

The output data of one scoring output of a request.

Parameters:

Name Type Description Default
score float

The similarity score, which is a scalar value.

required
Source code in vllm/outputs.py
@dataclass
class ScoringOutput:
    """The output data of one scoring output of a request.

    Args:
        score: The similarity score, which is a scalar value.
    """
    score: float

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        # pooling_output shape:
        #   classify task: (num_classes) num_classes == 1
        #   embed task: a scalar value
        pooled_data = pooling_output.data.squeeze()
        if pooled_data.ndim != 0:
            raise ValueError("pooled_data should be a scalar score")

        return ScoringOutput(pooled_data.item())

    def __repr__(self) -> str:
        return f"ScoringOutput(score={self.score})"

score instance-attribute

score: float

__init__

__init__(score: float) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    return f"ScoringOutput(score={self.score})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    # pooling_output shape:
    #   classify task: (num_classes) num_classes == 1
    #   embed task: a scalar value
    pooled_data = pooling_output.data.squeeze()
    if pooled_data.ndim != 0:
        raise ValueError("pooled_data should be a scalar score")

    return ScoringOutput(pooled_data.item())
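
A small sketch of the expected input shape:

import torch

from vllm.outputs import PoolingOutput, ScoringOutput

# from_base only accepts pooled data that squeezes down to a scalar.
pooled = PoolingOutput(data=torch.tensor([0.87]))
print(ScoringOutput.from_base(pooled))   # ScoringOutput(score=0.87...)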

ScoringRequestOutput

Bases: PoolingRequestOutput[ScoringOutput]

Source code in vllm/outputs.py
class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        return ScoringRequestOutput(
            request_id=request_output.request_id,
            outputs=ScoringOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            finished=request_output.finished,
        )

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    return ScoringRequestOutput(
        request_id=request_output.request_id,
        outputs=ScoringOutput.from_base(request_output.outputs),
        prompt_token_ids=request_output.prompt_token_ids,
        finished=request_output.finished,
    )

TextPrompt

Bases: TypedDict

Schema for a text prompt.

Source code in vllm/inputs/data.py
class TextPrompt(TypedDict):
    """Schema for a text prompt."""

    prompt: str
    """The input text to be tokenized before passing to the model."""

    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
    """

    mm_processor_kwargs: NotRequired[dict[str, Any]]
    """
    Optional multi-modal processor kwargs to be forwarded to the
    multimodal input mapper & processor. Note that if multiple modalities
    have registered mappers etc for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
    """

    cache_salt: NotRequired[str]
    """
    Optional cache salt to be used for prefix caching.
    """

cache_salt instance-attribute

cache_salt: NotRequired[str]

Optional cache salt to be used for prefix caching.

mm_processor_kwargs instance-attribute

mm_processor_kwargs: NotRequired[dict[str, Any]]

Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them.

multi_modal_data instance-attribute

multi_modal_data: NotRequired[MultiModalDataDict]

Optional multi-modal data to pass to the model, if the model supports it.

prompt instance-attribute

prompt: str

The input text to be tokenized before passing to the model.

TokensPrompt

Bases: TypedDict

Schema for a tokenized prompt.

Source code in vllm/inputs/data.py
class TokensPrompt(TypedDict):
    """Schema for a tokenized prompt."""

    prompt_token_ids: list[int]
    """A list of token IDs to pass to the model."""

    token_type_ids: NotRequired[list[int]]
    """A list of token type IDs to pass to the cross encoder model."""

    multi_modal_data: NotRequired["MultiModalDataDict"]
    """
    Optional multi-modal data to pass to the model,
    if the model supports it.
    """

    mm_processor_kwargs: NotRequired[dict[str, Any]]
    """
    Optional multi-modal processor kwargs to be forwarded to the
    multimodal input mapper & processor. Note that if multiple modalities
    have registered mappers etc for the model being considered, we attempt
    to pass the mm_processor_kwargs to each of them.
    """

    cache_salt: NotRequired[str]
    """
    Optional cache salt to be used for prefix caching.
    """

cache_salt instance-attribute

cache_salt: NotRequired[str]

Optional cache salt to be used for prefix caching.

mm_processor_kwargs instance-attribute

mm_processor_kwargs: NotRequired[dict[str, Any]]

Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them.

multi_modal_data instance-attribute

multi_modal_data: NotRequired[MultiModalDataDict]

Optional multi-modal data to pass to the model, if the model supports it.

prompt_token_ids instance-attribute

prompt_token_ids: list[int]

A list of token IDs to pass to the model.

token_type_ids instance-attribute

token_type_ids: NotRequired[list[int]]

A list of token type IDs to pass to the cross encoder model.

__getattr__

__getattr__(name: str) -> Any
Source code in vllm/__init__.py
def __getattr__(name: str) -> typing.Any:
    from importlib import import_module

    if name in MODULE_ATTRS:
        module_name, attr_name = MODULE_ATTRS[name].split(":")
        module = import_module(module_name, __package__)
        return getattr(module, attr_name)
    else:
        raise AttributeError(
            f'module {__package__} has no attribute {name}')

initialize_ray_cluster

initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
)

Initialize the distributed cluster with Ray.

It connects to the Ray cluster and creates a placement group for the workers, which specifies the resources required by each distributed worker.

Parameters:

Name Type Description Default
parallel_config ParallelConfig

The configurations for parallel execution.

required
ray_address Optional[str]

The address of the Ray cluster. If None, uses the default Ray cluster address.

None
Source code in vllm/executor/ray_utils.py
def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
):
    """Initialize the distributed cluster with Ray.

    it will connect to the Ray cluster and create a placement group
    for the workers, which includes the specification of the resources
    for each distributed worker.

    Args:
        parallel_config: The configurations for parallel execution.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
    assert_ray_available()
    from vllm.platforms import current_platform

    if ray.is_initialized():
        logger.info("Ray is already initialized. Skipping Ray initialization.")
    elif current_platform.is_rocm() or current_platform.is_xpu():
        # Try to connect existing ray instance and create a new one if not found
        try:
            ray.init("auto")
        except ConnectionError:
            logger.warning(
                "No existing RAY instance detected. "
                "A new instance will be launched with current node resources.")
            ray.init(address=ray_address,
                     num_gpus=parallel_config.world_size,
                     runtime_env=parallel_config.ray_runtime_env)
    else:
        ray.init(address=ray_address,
                 runtime_env=parallel_config.ray_runtime_env)

    device_str = current_platform.ray_device_key
    if not device_str:
        raise ValueError(
            f"current platform {current_platform.device_name} does not "
            "support ray.")

    # Create or get the placement group for worker processes
    if parallel_config.placement_group:
        current_placement_group = parallel_config.placement_group
    else:
        current_placement_group = ray.util.get_current_placement_group()

    if current_placement_group:
        logger.info("Using the existing placement group")

        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        device_bundles = 0
        for bundle in bundles:
            bundle_devices = bundle.get(device_str, 0)
            if bundle_devices > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 "
                    f"{device_str}.")
            if bundle_devices:
                device_bundles += 1
        if parallel_config.world_size > device_bundles:
            raise ValueError(
                f"The number of required {device_str}s exceeds the total "
                f"number of available {device_str}s in the placement group. "
                f"Required number of devices: {parallel_config.world_size}. "
                f"Total number of devices: {device_bundles}.")
    else:
        logger.info("No current placement group found. "
                    "Creating a new placement group.")
        num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
        # Log a warning message and delay resource allocation failure response.
        # Avoid immediate rejection to allow user-initiated placement group
        # created and wait cluster to be ready
        if parallel_config.world_size > num_devices_in_cluster:
            logger.warning(
                "The number of required %ss exceeds the total "
                "number of available %ss in the placement group.", device_str,
                device_str)
        # Create a new placement group
        placement_group_specs: List[Dict[str, float]] = ([{
            device_str: 1.0
        } for _ in range(parallel_config.world_size)])

        # vLLM engine is also a worker to execute model with an accelerator,
        # so it requires to have the device in a current node. Check if
        # the current node has at least one device.
        current_ip = get_ip()
        current_node_id = ray.get_runtime_context().get_node_id()
        current_node_resource = available_resources_per_node()[current_node_id]
        if current_node_resource.get(device_str, 0) < 1:
            raise ValueError(
                f"Current node has no {device_str} available. "
                f"{current_node_resource=}. vLLM engine cannot start without "
                f"{device_str}. Make sure you have at least 1 {device_str} "
                f"available in a node {current_node_id=} {current_ip=}.")
        # This way, at least bundle is required to be created in a current
        # node.
        placement_group_specs[0][f"node:{current_ip}"] = 0.001

        # By default, Ray packs resources as much as possible.
        current_placement_group = ray.util.placement_group(
            placement_group_specs, strategy="PACK")
        _wait_until_pg_ready(current_placement_group)

    assert current_placement_group is not None
    _verify_bundles(current_placement_group, parallel_config, device_str)
    # Set the placement group in the parallel config
    parallel_config.placement_group = current_placement_group
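
A hedged sketch of calling it directly; normally the Ray executor does this, and it assumes Ray is installed and the cluster exposes enough devices:

from vllm.config import ParallelConfig
from vllm.executor.ray_utils import initialize_ray_cluster

parallel_config = ParallelConfig(tensor_parallel_size=2)   # 2 workers / devices
initialize_ray_cluster(parallel_config)                    # starts or joins Ray
# The resulting placement group is stored on parallel_config.placement_group.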