vllm.compilation.backends

compilation_start_time `module-attribute` ¶

compilation_start_time = 0.0

logger `module-attribute` ¶

logger = init_logger(__name__)

model_tag `module-attribute` ¶

model_tag: str = 'backbone'

CompilerManager ¶

A manager to manage the compilation process, including caching the compiled graph, loading the compiled graph, and compiling the graph.

The cache is a dict mapping (runtime_shape, graph_index, backend_name) to any_data returned from the compiler.

When serializing the cache, we save it to a Python file for readability. We don't use json here because json doesn't support int as key.

Source code in vllm/compilation/backends.py

class CompilerManager:
    """
    A manager to manage the compilation process, including
    caching the compiled graph, loading the compiled graph,
    and compiling the graph.

    The cache is a dict mapping
    `(runtime_shape, graph_index, backend_name)`
    to `any_data` returned from the compiler.

    When serializing the cache, we save it to a Python file
    for readability. We don't use json here because json doesn't
    support int as key.
    """

    def __init__(self, compilation_config: CompilationConfig):
        """Create an empty in-memory cache and build the compiler.

        Args:
            compilation_config: stored on the manager and handed to
                `make_compiler` to create `self.compiler`.
        """
        self.cache: dict[tuple[Optional[int], int, str], Any] = dict()
        # Set by `compile` whenever a new handle is stored; cleared again
        # by `save_to_file` once the cache has been written to disk.
        self.is_cache_updated = False
        self.compilation_config = compilation_config
        self.compiler = make_compiler(compilation_config)

    def compute_hash(self, vllm_config: VllmConfig) -> str:
        """Return the hash computed by the underlying compiler."""
        return self.compiler.compute_hash(vllm_config)

    def initialize_cache(self,
                         cache_dir: str,
                         disable_cache: bool = False,
                         prefix: str = ""):
        """
        Initialize the cache directory for the compiler.

        The organization of the cache directory is as follows:
        cache_dir=/path/to/hash_str/rank_i_j/prefix/
        inside cache_dir, there will be:
        - vllm_compile_cache.py
        - computation_graph.py
        - transformed_code.py

        for multiple prefixes, they can share the same
        base cache dir of /path/to/hash_str/rank_i_j/ ,
        to store some common compilation artifacts.

        If `vllm_compile_cache.py` exists but cannot be parsed as a dict
        literal, a warning is logged and the in-memory cache is left
        untouched instead of raising.
        """

        self.disable_cache = disable_cache
        self.cache_dir = cache_dir
        self.cache_file_path = os.path.join(cache_dir, "vllm_compile_cache.py")

        if not disable_cache and os.path.exists(self.cache_file_path):
            # load the cache from the file
            with open(self.cache_file_path) as f:
                raw = f.read()
            try:
                # we use ast.literal_eval to parse the data
                # because it is a safe way to parse Python literals.
                # do not use eval(), it is unsafe.
                loaded = ast.literal_eval(raw)
            except (SyntaxError, ValueError, TypeError) as e:
                # e.g. an empty or half-written file
                logger.warning(
                    "Ignoring unreadable compile cache file %s: %s",
                    self.cache_file_path, e)
            else:
                if isinstance(loaded, dict):
                    self.cache = loaded
                else:
                    logger.warning(
                        "Ignoring compile cache file %s: expected a dict, "
                        "got %s", self.cache_file_path,
                        type(loaded).__name__)

        self.compiler.initialize_cache(cache_dir=cache_dir,
                                       disable_cache=disable_cache,
                                       prefix=prefix)

    def save_to_file(self):
        """Write the cache to `self.cache_file_path` as a Python literal.

        Does nothing when the cache is disabled or has not changed since
        the last save. Relies on `disable_cache` and `cache_file_path`,
        which are set by `initialize_cache`.
        """
        if self.disable_cache or not self.is_cache_updated:
            return
        data = pprint.pformat(self.cache, indent=4)
        with open(self.cache_file_path, "w") as f:
            f.write(data)
        # the file now matches the in-memory cache, so later calls can
        # return early until `compile` stores another handle.
        self.is_cache_updated = False

    def load(self,
             graph: fx.GraphModule,
             example_inputs: list[Any],
             graph_index: int,
             runtime_shape: Optional[int] = None) -> Optional[Callable]:
        """Load a previously cached graph through the compiler.

        Returns:
            `None` when there is no cache entry for
            `(runtime_shape, graph_index, self.compiler.name)`, otherwise
            whatever `self.compiler.load` returns for the stored handle.
        """
        key = (runtime_shape, graph_index, self.compiler.name)
        if key not in self.cache:
            return None
        handle = self.cache[key]
        compiled_graph = self.compiler.load(handle, graph, example_inputs,
                                            graph_index, runtime_shape)
        if runtime_shape is None:
            logger.debug(
                "Directly load the %s-th graph for dynamic shape from %s via "
                "handle %s", graph_index, self.compiler.name, handle)
        else:
            logger.debug(
                "Directly load the %s-th graph for shape %s from %s via "
                "handle %s", graph_index, str(runtime_shape),
                self.compiler.name, handle)
        return compiled_graph

    def compile(self,
                graph: fx.GraphModule,
                example_inputs,
                additional_inductor_config,
                compilation_config: CompilationConfig,
                graph_index: int = 0,
                num_graphs: int = 1,
                runtime_shape: Optional[int] = None) -> Any:
        """Return a compiled graph, loading it from the cache if possible.

        Args:
            graph: the graph module to compile.
            example_inputs: forwarded to the compiler.
            additional_inductor_config: forwarded to the compiler.
            compilation_config: its `compilation_time` is increased by the
                elapsed time once the last graph has been compiled.
            graph_index: index of this graph; index 0 starts the timer.
            num_graphs: total number of graphs; `num_graphs - 1` is
                treated as the last graph and stops the timer.
            runtime_shape: `None` means dynamic shape.

        Returns:
            The compiled graph (loaded or freshly compiled).
        """
        global compilation_start_time
        if graph_index == 0:
            # before compiling the first graph, record the start time
            compilation_start_time = time.time()

        compilation_counter.num_backend_compilations += 1

        is_last_graph = graph_index == num_graphs - 1

        # try to load from the cache
        compiled_graph = self.load(graph, example_inputs, graph_index,
                                   runtime_shape)
        if compiled_graph is not None:
            if is_last_graph:
                # after loading the last graph for this shape, record the time.
                # there can be multiple graphs due to piecewise compilation.
                elapsed = time.time() - compilation_start_time
                if runtime_shape is None:
                    logger.info(
                        "Directly load the compiled graph(s) for dynamic shape "
                        "from the cache, took %.3f s", elapsed)
                else:
                    logger.info(
                        "Directly load the compiled graph(s) for shape %s "
                        "from the cache, took %.3f s", str(runtime_shape),
                        elapsed)
            return compiled_graph

        # no compiler cached the graph, or the cache is disabled,
        # we need to compile it
        if isinstance(self.compiler, InductorAdaptor):
            # Let compile_fx generate a key for us
            maybe_key = None
        else:
            maybe_key = \
                f"artifact_shape_{runtime_shape}_subgraph_{graph_index}"
        compiled_graph, handle = self.compiler.compile(
            graph, example_inputs, additional_inductor_config, runtime_shape,
            maybe_key)

        assert compiled_graph is not None, "Failed to compile the graph"

        # store the artifact in the cache
        if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
            self.cache[(runtime_shape, graph_index,
                        self.compiler.name)] = handle
            compilation_counter.num_cache_entries_updated += 1
            self.is_cache_updated = True
            if graph_index == 0:
                # adds some info logging for the first graph
                if runtime_shape is None:
                    logger.info(
                        "Cache the graph for dynamic shape for later use")
                else:
                    logger.info("Cache the graph of shape %s for later use",
                                str(runtime_shape))
            if runtime_shape is None:
                logger.debug(
                    "Store the %s-th graph for dynamic shape from %s via "
                    "handle %s", graph_index, self.compiler.name, handle)
            else:
                logger.debug(
                    "Store the %s-th graph for shape %s from %s via handle %s",
                    graph_index, str(runtime_shape), self.compiler.name,
                    handle)

        # after compiling the last graph, record the end time
        if is_last_graph:
            elapsed = time.time() - compilation_start_time
            compilation_config.compilation_time += elapsed
            if runtime_shape is None:
                logger.info("Compiling a graph for dynamic shape takes %.2f s",
                            elapsed)
            else:
                logger.info("Compiling a graph for shape %s takes %.2f s",
                            runtime_shape, elapsed)

        return compiled_graph

cache `instance-attribute` ¶

cache: dict[tuple[Optional[int], int, str], Any] = dict()

compilation_config `instance-attribute` ¶

compilation_config = compilation_config

compiler `instance-attribute` ¶

compiler = make_compiler(compilation_config)

is_cache_updated `instance-attribute` ¶

is_cache_updated = False

__init__ ¶

__init__(compilation_config: CompilationConfig)

Source code in vllm/compilation/backends.py

def __init__(self, compilation_config: CompilationConfig):
    """Build the compiler for `compilation_config` and start with an
    empty, not-yet-modified cache."""
    self.compilation_config = compilation_config
    self.compiler = make_compiler(compilation_config)
    # maps (runtime_shape, graph_index, backend_name) -> compiler handle
    self.cache: dict[tuple[Optional[int], int, str], Any] = {}
    self.is_cache_updated = False

compile ¶

compile(
    graph: GraphModule,
    example_inputs,
    additional_inductor_config,
    compilation_config: CompilationConfig,
    graph_index: int = 0,
    num_graphs: int = 1,
    runtime_shape: Optional[int] = None,
) -> Any

Source code in vllm/compilation/backends.py

def compile(self,
            graph: fx.GraphModule,
            example_inputs,
            additional_inductor_config,
            compilation_config: CompilationConfig,
            graph_index: int = 0,
            num_graphs: int = 1,
            runtime_shape: Optional[int] = None) -> Any:
    """Return a compiled graph, reusing a cached artifact when one exists.

    The timer starts when `graph_index == 0` and is read when
    `graph_index == num_graphs - 1`. On a cache miss the elapsed time is
    also added to `compilation_config.compilation_time`.
    `runtime_shape=None` means dynamic shape.
    """
    global compilation_start_time
    if graph_index == 0:
        # the first graph marks the beginning of the timed region
        compilation_start_time = time.time()

    compilation_counter.num_backend_compilations += 1

    dynamic_shape = runtime_shape is None
    is_last_graph = graph_index == num_graphs - 1

    # fast path: reuse what an earlier run stored in the cache
    cached_graph = self.load(graph, example_inputs, graph_index,
                             runtime_shape)
    if cached_graph is not None:
        if is_last_graph:
            # piecewise compilation may yield several graphs per shape;
            # report once, after the last one has been loaded.
            elapsed = time.time() - compilation_start_time
            if dynamic_shape:
                logger.info(
                    "Directly load the compiled graph(s) for dynamic shape "
                    "from the cache, took %.3f s", elapsed)
            else:
                logger.info(
                    "Directly load the compiled graph(s) for shape %s "
                    "from the cache, took %.3f s", str(runtime_shape),
                    elapsed)
        return cached_graph

    # slow path: nothing usable in the cache (or it is disabled)
    if isinstance(self.compiler, InductorAdaptor):
        # Let compile_fx generate a key for us
        artifact_key = None
    else:
        artifact_key = (
            f"artifact_shape_{runtime_shape}_subgraph_{graph_index}")
    compiled_graph, handle = self.compiler.compile(
        graph, example_inputs, additional_inductor_config, runtime_shape,
        artifact_key)

    assert compiled_graph is not None, "Failed to compile the graph"

    # remember the artifact so that later runs can load it directly
    if not envs.VLLM_DISABLE_COMPILE_CACHE and handle is not None:
        backend_name = self.compiler.name
        self.cache[(runtime_shape, graph_index, backend_name)] = handle
        compilation_counter.num_cache_entries_updated += 1
        self.is_cache_updated = True
        if graph_index == 0:
            # info-level message only for the first graph
            if dynamic_shape:
                logger.info(
                    "Cache the graph for dynamic shape for later use")
            else:
                logger.info("Cache the graph of shape %s for later use",
                            str(runtime_shape))
        if dynamic_shape:
            logger.debug(
                "Store the %s-th graph for dynamic shape from %s via "
                "handle %s", graph_index, backend_name, handle)
        else:
            logger.debug(
                "Store the %s-th graph for shape %s from %s via handle %s",
                graph_index, str(runtime_shape), backend_name, handle)

    # the last graph closes the timed region
    if is_last_graph:
        elapsed = time.time() - compilation_start_time
        compilation_config.compilation_time += elapsed
        if dynamic_shape:
            logger.info("Compiling a graph for dynamic shape takes %.2f s",
                        elapsed)
        else:
            logger.info("Compiling a graph for shape %s takes %.2f s",
                        runtime_shape, elapsed)

    return compiled_graph

compute_hash ¶

compute_hash(vllm_config: VllmConfig) -> str

Source code in vllm/compilation/backends.py

def compute_hash(self, vllm_config: VllmConfig) -> str:
    """Delegate to the underlying compiler's `compute_hash`."""
    compiler = self.compiler
    return compiler.compute_hash(vllm_config)

initialize_cache ¶

initialize_cache(
    cache_dir: str,
    disable_cache: bool = False,
    prefix: str = "",
)

Initialize the cache directory for the compiler.

The organization of the cache directory is as follows: `cache_dir=/path/to/hash_str/rank_i_j/prefix/`. Inside `cache_dir`, there will be:

- vllm_compile_cache.py
- computation_graph.py
- transformed_code.py

Multiple prefixes can share the same base cache dir, `/path/to/hash_str/rank_i_j/`, to store some common compilation artifacts.

Source code in vllm/compilation/backends.py

def initialize_cache(self,
                     cache_dir: str,
                     disable_cache: bool = False,
                     prefix: str = ""):
    """
    Initialize the cache directory for the compiler.

    The organization of the cache directory is as follows:
    cache_dir=/path/to/hash_str/rank_i_j/prefix/
    inside cache_dir, there will be:
    - vllm_compile_cache.py
    - computation_graph.py
    - transformed_code.py

    for multiple prefixes, they can share the same
    base cache dir of /path/to/hash_str/rank_i_j/ ,
    to store some common compilation artifacts.

    If `vllm_compile_cache.py` exists but cannot be parsed as a dict
    literal, a warning is logged and the in-memory cache is left
    untouched instead of raising.
    """

    self.disable_cache = disable_cache
    self.cache_dir = cache_dir
    self.cache_file_path = os.path.join(cache_dir, "vllm_compile_cache.py")

    if not disable_cache and os.path.exists(self.cache_file_path):
        # load the cache from the file
        with open(self.cache_file_path) as f:
            raw = f.read()
        try:
            # we use ast.literal_eval to parse the data
            # because it is a safe way to parse Python literals.
            # do not use eval(), it is unsafe.
            loaded = ast.literal_eval(raw)
        except (SyntaxError, ValueError, TypeError) as e:
            # e.g. an empty or half-written file
            logger.warning(
                "Ignoring unreadable compile cache file %s: %s",
                self.cache_file_path, e)
        else:
            if isinstance(loaded, dict):
                self.cache = loaded
            else:
                logger.warning(
                    "Ignoring compile cache file %s: expected a dict, "
                    "got %s", self.cache_file_path,
                    type(loaded).__name__)

    # the compiler is initialized regardless of whether the file loaded
    self.compiler.initialize_cache(cache_dir=cache_dir,
                                   disable_cache=disable_cache,
                                   prefix=prefix)

load ¶

load(
    graph: GraphModule,
    example_inputs: list[Any],
    graph_index: int,
    runtime_shape: Optional[int] = None,
) -> Optional[Callable]

Source code in vllm/compilation/backends.py

def load(self,
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
         runtime_shape: Optional[int] = None) -> Optional[Callable]:
    """Load a cached graph through the compiler, if an entry exists.

    Returns `None` when the cache has no entry for
    `(runtime_shape, graph_index, self.compiler.name)`; otherwise returns
    the result of `self.compiler.load` for the stored handle.
    """
    backend_name = self.compiler.name
    try:
        handle = self.cache[(runtime_shape, graph_index, backend_name)]
    except KeyError:
        # nothing cached for this shape / graph / backend
        return None

    loaded_graph = self.compiler.load(handle, graph, example_inputs,
                                      graph_index, runtime_shape)
    if runtime_shape is not None:
        logger.debug(
            "Directly load the %s-th graph for shape %s from %s via "
            "handle %s", graph_index, str(runtime_shape), backend_name,
            handle)
    else:
        logger.debug(
            "Directly load the %s-th graph for dynamic shape from %s via "
            "handle %s", graph_index, backend_name, handle)
    return loaded_graph

save_to_file ¶

save_to_file()

Source code in vllm/compilation/backends.py

def save_to_file(self):
    """Write the cache to `self.cache_file_path` as a Python literal.

    Does nothing when the cache is disabled or has not changed since the
    last save. After a successful write the `is_cache_updated` flag is
    cleared, so repeated calls do not rewrite an unchanged file.
    """
    if self.disable_cache or not self.is_cache_updated:
        return
    data = pprint.pformat(self.cache, indent=4)
    with open(self.cache_file_path, "w") as f:
        f.write(data)
    # the file now matches the in-memory cache
    self.is_cache_updated = False

PiecewiseCompileInterpreter ¶

Bases: Interpreter

Code adapted from `torch.fx.passes.shape_prop.ShapeProp`. It runs the given graph with fake inputs, and compiles the submodules specified by `compile_submod_names` with the given compilation configs.

NOTE: the order in compile_submod_names matters, because it will be used to determine the order of the compiled piecewise graphs. The first graph will handle logging, and the last graph has some special cudagraph output handling.

Source code in vllm/compilation/backends.py

class PiecewiseCompileInterpreter(torch.fx.Interpreter):
    """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`.
    It runs the given graph with fake inputs, and compile some
    submodules specified by `compile_submod_names` with the given
    compilation configs.

    NOTE: the order in `compile_submod_names` matters, because
    it will be used to determine the order of the compiled piecewise
    graphs. The first graph will handle logging, and the last graph
    has some special cudagraph output handling.
    """

    def __init__(self, module: torch.fx.GraphModule,
                 compile_submod_names: list[str], vllm_config: VllmConfig,
                 vllm_backend: "VllmBackend"):
        """Store the configs and capture the currently active fake mode.

        Args:
            module: the graph module to interpret.
            compile_submod_names: names of the submodules to compile;
                the position in this list is used as the graph index.
            vllm_config: its `compilation_config` is kept separately too.
            vllm_backend: provides the `compiler_manager` used to compile.
        """
        super().__init__(module)
        from torch._guards import detect_fake_mode
        self.fake_mode = detect_fake_mode()
        self.compile_submod_names = compile_submod_names
        self.compilation_config = vllm_config.compilation_config
        self.vllm_config = vllm_config
        self.vllm_backend = vllm_backend
        # When True, it annoyingly dumps the torch.fx.Graph on errors.
        self.extra_traceback = False

    def run(self, *args):
        """Run the graph with tensor arguments converted to fake tensors.

        Non-tensor arguments are passed through unchanged.
        """
        fake_args = [
            self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
            for t in args
        ]
        with self.fake_mode, enable_python_dispatcher():
            return super().run(*fake_args)

    def call_module(self, target: torch.fx.node.Target,
                    args: tuple[torch.fx.node.Argument,
                                ...], kwargs: dict[str, Any]) -> Any:
        """Run submodule `target` and, if it is listed in
        `compile_submod_names`, compile it and swap in the compiled
        backend.

        The submodule is first executed through the base interpreter to
        obtain `output`. For a listed submodule, it is then compiled for
        dynamic shape via `vllm_backend.compiler_manager`, wrapped in a
        `PiecewiseBackend` (and in the platform's static graph wrapper
        unless `cudagraph_mode` is `CUDAGraphMode.NONE`), and stored in
        `self.module.__dict__[target]`.

        Returns:
            The output of the base interpreter's `call_module`.
        """
        assert isinstance(target, str)
        output = super().call_module(target, args, kwargs)

        try:
            # position in the list doubles as the piecewise graph index
            index = self.compile_submod_names.index(target)
        except ValueError:
            # not one of the submodules we are asked to compile
            return output

        num_graphs = len(self.compile_submod_names)
        submod = self.fetch_attr(target)
        # positions of the arguments that are symbolic integers
        sym_shape_indices = [
            i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
        ]
        compiled_graph_for_dynamic_shape = self.vllm_backend.\
            compiler_manager.compile(
            submod,
            args,
            self.compilation_config.inductor_compile_config,
            self.compilation_config,
            graph_index=index,
            num_graphs=num_graphs,
            runtime_shape=None)
        # Lazy import here to avoid circular import
        from .cuda_graph import CUDAGraphOptions
        from .cuda_piecewise_backend import PiecewiseBackend

        piecewise_backend = PiecewiseBackend(
            submod, self.vllm_config, index, num_graphs, sym_shape_indices,
            compiled_graph_for_dynamic_shape, self.vllm_backend)

        if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
            # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
            # class) as platform dependent.
            static_graph_wrapper_class = resolve_obj_by_qualname(
                current_platform.get_static_graph_wrapper_cls())

            # Always assign PIECEWISE runtime mode to the
            # CUDAGraphWrapper for piecewise_backend, to distinguish
            # it from the FULL cudagraph runtime mode, no matter it
            # is wrapped on a full or piecewise fx graph.
            self.module.__dict__[target] = static_graph_wrapper_class(
                runnable=piecewise_backend,
                vllm_config=self.vllm_config,
                runtime_mode=CUDAGraphMode.PIECEWISE,
                cudagraph_options=CUDAGraphOptions(
                    debug_log_enable=piecewise_backend.is_first_graph,
                    gc_disable=not piecewise_backend.is_first_graph,
                    weak_ref_output=piecewise_backend.is_last_graph))
        else:
            self.module.__dict__[target] = piecewise_backend

        compilation_counter.num_piecewise_capturable_graphs_seen += 1

        return output

compilation_config `instance-attribute` ¶

compilation_config = compilation_config

compile_submod_names `instance-attribute` ¶

compile_submod_names = compile_submod_names

extra_traceback `instance-attribute` ¶

extra_traceback = False

fake_mode `instance-attribute` ¶

fake_mode = detect_fake_mode()

vllm_backend `instance-attribute` ¶

vllm_backend = vllm_backend

vllm_config `instance-attribute` ¶

vllm_config = vllm_config

__init__ ¶

__init__(
    module: GraphModule,
    compile_submod_names: list[str],
    vllm_config: VllmConfig,
    vllm_backend: VllmBackend,
)

Source code in vllm/compilation/backends.py

def __init__(self, module: torch.fx.GraphModule,
             compile_submod_names: list[str], vllm_config: VllmConfig,
             vllm_backend: "VllmBackend"):
    """Set up the interpreter for `module` and remember the configs,
    the backend and the currently active fake mode."""
    super().__init__(module)
    from torch._guards import detect_fake_mode

    # configuration and collaborators
    self.vllm_config = vllm_config
    self.vllm_backend = vllm_backend
    self.compilation_config = vllm_config.compilation_config
    self.compile_submod_names = compile_submod_names

    # fake mode detected at construction time; used later by `run`
    self.fake_mode = detect_fake_mode()

    # When True, it annoyingly dumps the torch.fx.Graph on errors.
    self.extra_traceback = False

call_module ¶

call_module(
    target: Target,
    args: tuple[Argument, ...],
    kwargs: dict[str, Any],
) -> Any

Source code in vllm/compilation/backends.py

def call_module(self, target: torch.fx.node.Target,
                args: tuple[torch.fx.node.Argument,
                            ...], kwargs: dict[str, Any]) -> Any:
    """Run submodule `target` and, if it is listed in
    `compile_submod_names`, compile it and swap in the compiled backend.

    The submodule is first executed through the base interpreter to
    obtain `output`. For a listed submodule, it is then compiled for
    dynamic shape via `vllm_backend.compiler_manager`, wrapped in a
    `PiecewiseBackend` (and in the platform's static graph wrapper unless
    `cudagraph_mode` is `CUDAGraphMode.NONE`), and stored in
    `self.module.__dict__[target]`.

    Returns:
        The output of the base interpreter's `call_module`.
    """
    assert isinstance(target, str)
    output = super().call_module(target, args, kwargs)

    try:
        # position in the list doubles as the piecewise graph index
        index = self.compile_submod_names.index(target)
    except ValueError:
        # not one of the submodules we are asked to compile
        return output

    num_graphs = len(self.compile_submod_names)
    submod = self.fetch_attr(target)
    # positions of the arguments that are symbolic integers
    sym_shape_indices = [
        i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
    ]
    compiled_graph_for_dynamic_shape = self.vllm_backend.\
        compiler_manager.compile(
        submod,
        args,
        self.compilation_config.inductor_compile_config,
        self.compilation_config,
        graph_index=index,
        num_graphs=num_graphs,
        runtime_shape=None)
    # Lazy import here to avoid circular import
    from .cuda_graph import CUDAGraphOptions
    from .cuda_piecewise_backend import PiecewiseBackend

    piecewise_backend = PiecewiseBackend(
        submod, self.vllm_config, index, num_graphs, sym_shape_indices,
        compiled_graph_for_dynamic_shape, self.vllm_backend)

    if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
        # resolve the static graph wrapper class (e.g. CUDAGraphWrapper
        # class) as platform dependent.
        static_graph_wrapper_class = resolve_obj_by_qualname(
            current_platform.get_static_graph_wrapper_cls())

        # Always assign PIECEWISE runtime mode to the
        # CUDAGraphWrapper for piecewise_backend, to distinguish
        # it from the FULL cudagraph runtime mode, no matter it
        # is wrapped on a full or piecewise fx graph.
        self.module.__dict__[target] = static_graph_wrapper_class(
            runnable=piecewise_backend,
            vllm_config=self.vllm_config,
            runtime_mode=CUDAGraphMode.PIECEWISE,
            cudagraph_options=CUDAGraphOptions(
                debug_log_enable=piecewise_backend.is_first_graph,
                gc_disable=not piecewise_backend.is_first_graph,
                weak_ref_output=piecewise_backend.is_last_graph))
    else:
        self.module.__dict__[target] = piecewise_backend

    compilation_counter.num_piecewise_capturable_graphs_seen += 1

    return output

run ¶

run(*args)

Source code in vllm/compilation/backends.py

def run(self, *args):
    """Interpret the graph with every tensor argument replaced by its
    fake-tensor counterpart; other arguments are forwarded as they are."""
    fake_args = []
    for arg in args:
        if isinstance(arg, torch.Tensor):
            arg = self.fake_mode.from_tensor(arg)
        fake_args.append(arg)

    with self.fake_mode, enable_python_dispatcher():
        result = super().run(*fake_args)
    return result

SplitItem `dataclass` ¶

Source code in vllm/compilation/backends.py

@dataclasses.dataclass
class SplitItem:
    """Record describing one submodule graph.

    `VllmBackend.piecewise_graphs` is declared as a list of these items.

    NOTE(review): the per-field notes below are inferred from the field
    names only; confirm them against the code that builds these items.
    """
    # name of the submodule -- presumably its attribute name on the
    # parent graph module; TODO confirm
    submod_name: str
    # integer id of the graph -- TODO confirm how ids are assigned
    graph_id: int
    # presumably marks a submodule that acts as a split point rather
    # than one to be compiled; TODO confirm
    is_splitting_graph: bool
    # the fx graph module for this item
    graph: fx.GraphModule

graph `instance-attribute` ¶

graph: GraphModule

graph_id `instance-attribute` ¶

graph_id: int

is_splitting_graph `instance-attribute` ¶

is_splitting_graph: bool

submod_name `instance-attribute` ¶

submod_name: str

__init__ ¶

__init__(
    submod_name: str,
    graph_id: int,
    is_splitting_graph: bool,
    graph: GraphModule,
) -> None

VllmBackend ¶

The compilation backend for `torch.compile` with vLLM. It is used for the `CompilationLevel.PIECEWISE` compilation level, where we customize the compilation.

The major work of this backend is to split the graph into piecewise graphs, and pass them to the piecewise backend.

This backend also adds the PostGradPassManager to Inductor config, which handles the post-grad passes.

Source code in vllm/compilation/backends.py

class VllmBackend:
    """The compilation backend for `torch.compile` with vLLM.
    It is used for compilation level of `CompilationLevel.PIECEWISE`,
    where we customize the compilation.

    The major work of this backend is to split the graph into
    piecewise graphs, and pass them to the piecewise backend.

    This backend also adds the PostGradPassManager to Inductor config,
    which handles the post-grad passes.

    An instance is itself the backend callable: `__call__` takes the
    traced graph plus example inputs and returns the callable to run.
    `__call__` asserts that it is invoked at most once per instance.
    """

    vllm_config: VllmConfig
    compilation_config: CompilationConfig
    # flipped to True near the end of `__call__`; `__call__` asserts it
    # is still False before compiling
    _called: bool = False
    # the graph we compiled
    graph: fx.GraphModule
    # the stitching graph module for all the piecewise graphs
    split_gm: fx.GraphModule
    piecewise_graphs: list[SplitItem]
    # NOTE(review): declared here but never assigned inside this class
    returned_callable: Callable
    # Inductor passes to run on the graph pre-defunctionalization
    # NOTE(review): declared here but never assigned inside this class;
    # the passes actually used live in `post_grad_pass_manager`
    post_grad_passes: Sequence[Callable]
    # positions (in the example inputs) of tensors that have at least
    # one symbolic dimension; filled in by `__call__`
    sym_tensor_indices: list[int]
    # clones of the example inputs at `sym_tensor_indices`, used as
    # static buffers by the `copy_and_call` wrapper
    input_buffers: list[torch.Tensor]
    compiler_manager: CompilerManager

    def __init__(
        self,
        vllm_config: VllmConfig,
        prefix: str = "",
    ):
        """Set up the backend state; no compilation happens here.

        Args:
            vllm_config: the full vLLM config; its `compilation_config`
                is also stored separately and handed to the
                `CompilerManager`.
            prefix: name used as the last component of the local cache
                directory. When empty, the module-level `model_tag` is
                used instead.
        """

        # if the model is initialized with a non-empty prefix,
        # then usually it's enough to use that prefix,
        # e.g. language_model, vision_model, etc.
        # when multiple parts are initialized as independent
        # models, we need to use the model_tag to distinguish
        # them, e.g. backbone (default), eagle_head, etc.
        self.prefix = prefix or model_tag

        # Passes to run on the graph post-grad.
        self.post_grad_pass_manager = PostGradPassManager()

        self.sym_tensor_indices = []
        self.input_buffers = []

        self.vllm_config = vllm_config
        self.compilation_config = vllm_config.compilation_config

        self.compiler_manager: CompilerManager = CompilerManager(
            self.compilation_config)

        # `torch.compile` is JIT compiled, so we don't need to
        # do anything here

    def configure_post_pass(self):
        """Configure the pass manager and install it in the Inductor config.

        Configures `self.post_grad_pass_manager` from `self.vllm_config`,
        folds in any pass already registered under the
        `post_grad_custom_post_pass` key, and then stores the pass manager
        under that key in `compilation_config.inductor_compile_config`.
        """
        config = self.compilation_config
        self.post_grad_pass_manager.configure(self.vllm_config)

        # Post-grad custom passes are run using the post_grad_custom_post_pass
        # hook. If a pass for that hook exists, add it to the pass manager.
        inductor_config = config.inductor_compile_config
        PASS_KEY = "post_grad_custom_post_pass"
        if PASS_KEY in inductor_config:
            # Config should automatically wrap all inductor passes
            if isinstance(inductor_config[PASS_KEY], PostGradPassManager):
                # a pass manager is already installed: it must have the
                # same uuid as ours, and nothing is added
                assert (inductor_config[PASS_KEY].uuid() ==
                        self.post_grad_pass_manager.uuid())
            else:
                # a single pass was registered: keep it by adding it to
                # our manager before the key is overwritten below
                assert isinstance(inductor_config[PASS_KEY], InductorPass)
                self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
        inductor_config[PASS_KEY] = self.post_grad_pass_manager

    def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
        """Compile `graph` piecewise and return the callable to run.

        Steps, in order: pick (or derive) the cache directory and
        initialize the compiler cache; record the Dynamo time; split the
        graph at `compilation_config.splitting_ops`; run
        `PiecewiseCompileInterpreter` over the split graph with
        `example_inputs`; dump the split graph to `computation_graph.py`
        if that file does not exist yet.

        Args:
            graph: the graph module to compile.
            example_inputs: the inputs used to run the interpreter and,
                when input copying is enabled, to size the static buffers.

        Returns:
            `self.split_gm` when `cudagraph_mode` is `CUDAGraphMode.NONE`
            or `cudagraph_copy_inputs` is false; otherwise a wrapper that
            first copies symbolic-shaped inputs into static buffers.
        """

        vllm_config = self.vllm_config
        if not self.compilation_config.cache_dir:
            # no provided cache dir, generate one based on the known factors
            # that affect the compilation. if none of the factors change,
            # the cache dir will be the same so that we can reuse the compiled
            # graph.

            factors = []
            # 0. factors come from the env, for example, The values of
            # VLLM_PP_LAYER_PARTITION will affect the computation graph.
            env_hash = envs.compute_hash()
            factors.append(env_hash)

            # 1. factors come from the vllm_config (it mainly summarizes how the
            #    model is created)
            config_hash = vllm_config.compute_hash()
            factors.append(config_hash)

            # 2. factors come from the code files that are traced by Dynamo (
            #    it mainly summarizes how the model is used in forward pass)
            # sorted so the hash does not depend on collection order; the
            # collection is emptied once it has been consumed
            forward_code_files = list(
                sorted(self.compilation_config.traced_files))
            self.compilation_config.traced_files.clear()
            logger.debug(
                "Traced files (to be considered for compilation cache):\n%s",
                "\n".join(forward_code_files))
            # both the path and the file content go into the hash
            hash_content = []
            for filepath in forward_code_files:
                hash_content.append(filepath)
                if filepath == "<string>":
                    # This means the function was dynamically generated, with
                    # e.g. exec(). We can't actually check these.
                    continue
                with open(filepath) as f:
                    hash_content.append(f.read())
            import hashlib
            code_hash = hashlib.md5("\n".join(hash_content).encode(),
                                    usedforsecurity=False).hexdigest()
            factors.append(code_hash)

            # 3. compiler hash
            compiler_hash = self.compiler_manager.compute_hash(vllm_config)
            factors.append(compiler_hash)

            # combine all factors to generate the cache dir
            # (only the first 10 hex characters of the digest are kept)
            hash_key = hashlib.md5(str(factors).encode(),
                                   usedforsecurity=False).hexdigest()[:10]

            cache_dir = os.path.join(
                envs.VLLM_CACHE_ROOT,
                "torch_compile_cache",
                hash_key,
            )
            self.compilation_config.cache_dir = cache_dir

        cache_dir = self.compilation_config.cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        # NOTE(review): writes back the value that was just read
        self.compilation_config.cache_dir = cache_dir
        # each (rank, dp_rank, prefix) combination gets its own subdirectory
        rank = vllm_config.parallel_config.rank
        dp_rank = vllm_config.parallel_config.data_parallel_rank
        local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}",
                                       self.prefix)
        os.makedirs(local_cache_dir, exist_ok=True)
        self.compilation_config.local_cache_dir = local_cache_dir

        disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE

        if disable_cache:
            logger.info("vLLM's torch.compile cache is disabled.")
        else:
            logger.info("Using cache directory: %s for vLLM's torch.compile",
                        local_cache_dir)

        self.compiler_manager.initialize_cache(local_cache_dir, disable_cache,
                                               self.prefix)

        # when dynamo calls the backend, it means the bytecode
        # transform and analysis are done
        compilation_counter.num_graphs_seen += 1
        from .monitor import torch_compile_start_time
        dynamo_time = time.time() - torch_compile_start_time
        logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time)
        self.compilation_config.compilation_time += dynamo_time

        # we control the compilation process, each instance can only be
        # called once
        # (note: the cache setup and counter updates above have already
        # happened by the time this check runs)
        assert not self._called, "VllmBackend can only be called once"

        self.graph = graph
        self.configure_post_pass()

        self.split_gm, self.piecewise_graphs = split_graph(
            graph, self.compilation_config.splitting_ops)

        from torch._dynamo.utils import lazy_format_graph_code

        # depyf will hook lazy_format_graph_code and dump the graph
        # for debugging, no need to print the graph here
        lazy_format_graph_code("before split", self.graph)
        lazy_format_graph_code("after split", self.split_gm)

        compilation_counter.num_piecewise_graphs_seen += len(
            self.piecewise_graphs)
        # the submodules that hold a splitting op are not compiled
        submod_names_to_compile = [
            item.submod_name for item in self.piecewise_graphs
            if not item.is_splitting_graph
        ]

        # propagate the split graph to the piecewise backend,
        # compile submodules with symbolic shapes
        PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile,
                                    self.vllm_config,
                                    self).run(*example_inputs)

        # dump the split graph as readable source; an already existing
        # file is left untouched
        graph_path = os.path.join(local_cache_dir, "computation_graph.py")
        if not os.path.exists(graph_path):
            # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
            # use `print_readable` because it can include submodules
            src = "from __future__ import annotations\nimport torch\n" + \
                self.split_gm.print_readable(print_output=False)
            src = src.replace("<lambda>", "GraphModule")
            with open(graph_path, "w") as f:
                f.write(src)

            logger.debug("Computation graph saved to %s", graph_path)

        self._called = True

        # without cudagraphs, or when inputs need not be copied, the
        # split graph module is returned as is
        if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \
            not self.compilation_config.cudagraph_copy_inputs:
            return self.split_gm

        # if we need to copy input buffers for cudagraph
        from torch._guards import detect_fake_mode
        fake_mode = detect_fake_mode()
        fake_args = [
            fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
            for t in example_inputs
        ]

        # index of tensors that have symbolic shapes (batch size)
        # for weights and static buffers, they will have concrete shapes.
        # symbolic shape only happens for input tensors.
        from torch.fx.experimental.symbolic_shapes import is_symbolic
        self.sym_tensor_indices = [
            i for i, x in enumerate(fake_args)
            if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \
                any(is_symbolic(d) for d in x.size())
        ]

        # compiler managed cudagraph input buffers
        # we assume the first run with symbolic shapes
        # has the maximum size among all the tensors
        self.input_buffers = [
            example_inputs[x].clone() for x in self.sym_tensor_indices
        ]

        # this is the callable we return to Dynamo to run
        def copy_and_call(*args):
            list_args = list(args)
            for i, index in enumerate(self.sym_tensor_indices):
                runtime_tensor = list_args[index]
                # only the leading dimension is used to size the slice
                # of the static buffer
                runtime_shape = runtime_tensor.shape[0]
                static_tensor = self.input_buffers[i][:runtime_shape]

                # copy the tensor to the static buffer
                static_tensor.copy_(runtime_tensor)

                # replace the tensor in the list_args to the static buffer
                list_args[index] = static_tensor
            return self.split_gm(*list_args)

        return copy_and_call

_called `class-attribute` `instance-attribute` ¶

_called: bool = False

compilation_config `instance-attribute` ¶

compilation_config: CompilationConfig = compilation_config

compiler_manager `instance-attribute` ¶

compiler_manager: CompilerManager = CompilerManager(
    compilation_config
)

graph `instance-attribute` ¶

graph: GraphModule

input_buffers `instance-attribute` ¶

input_buffers: list[Tensor] = []

piecewise_graphs `instance-attribute` ¶

piecewise_graphs: list[SplitItem]

post_grad_pass_manager `instance-attribute` ¶

post_grad_pass_manager = PostGradPassManager()

post_grad_passes `instance-attribute` ¶

post_grad_passes: Sequence[Callable]

prefix `instance-attribute` ¶

prefix = prefix or model_tag

returned_callable `instance-attribute` ¶

returned_callable: Callable

split_gm `instance-attribute` ¶

split_gm: GraphModule

sym_tensor_indices `instance-attribute` ¶

sym_tensor_indices: list[int] = []

vllm_config `instance-attribute` ¶

vllm_config: VllmConfig = vllm_config

__call__ ¶

__call__(graph: GraphModule, example_inputs) -> Callable

Source code in vllm/compilation/backends.py

def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
    """Compile `graph` piecewise and return the callable to run.

    Steps, in order: pick (or derive) the cache directory and
    initialize the compiler cache; record the Dynamo time; split the
    graph at `compilation_config.splitting_ops`; run
    `PiecewiseCompileInterpreter` over the split graph with
    `example_inputs`; dump the split graph to `computation_graph.py`
    if that file does not exist yet.

    Args:
        graph: the graph module to compile.
        example_inputs: the inputs used to run the interpreter and,
            when input copying is enabled, to size the static buffers.

    Returns:
        `self.split_gm` when `cudagraph_mode` is `CUDAGraphMode.NONE`
        or `cudagraph_copy_inputs` is false; otherwise a wrapper that
        first copies symbolic-shaped inputs into static buffers.
    """

    vllm_config = self.vllm_config
    if not self.compilation_config.cache_dir:
        # no provided cache dir, generate one based on the known factors
        # that affect the compilation. if none of the factors change,
        # the cache dir will be the same so that we can reuse the compiled
        # graph.

        factors = []
        # 0. factors come from the env, for example, The values of
        # VLLM_PP_LAYER_PARTITION will affect the computation graph.
        env_hash = envs.compute_hash()
        factors.append(env_hash)

        # 1. factors come from the vllm_config (it mainly summarizes how the
        #    model is created)
        config_hash = vllm_config.compute_hash()
        factors.append(config_hash)

        # 2. factors come from the code files that are traced by Dynamo (
        #    it mainly summarizes how the model is used in forward pass)
        # sorted so the hash does not depend on collection order; the
        # collection is emptied once it has been consumed
        forward_code_files = list(
            sorted(self.compilation_config.traced_files))
        self.compilation_config.traced_files.clear()
        logger.debug(
            "Traced files (to be considered for compilation cache):\n%s",
            "\n".join(forward_code_files))
        # both the path and the file content go into the hash
        hash_content = []
        for filepath in forward_code_files:
            hash_content.append(filepath)
            if filepath == "<string>":
                # This means the function was dynamically generated, with
                # e.g. exec(). We can't actually check these.
                continue
            with open(filepath) as f:
                hash_content.append(f.read())
        import hashlib
        code_hash = hashlib.md5("\n".join(hash_content).encode(),
                                usedforsecurity=False).hexdigest()
        factors.append(code_hash)

        # 3. compiler hash
        compiler_hash = self.compiler_manager.compute_hash(vllm_config)
        factors.append(compiler_hash)

        # combine all factors to generate the cache dir
        # (only the first 10 hex characters of the digest are kept)
        hash_key = hashlib.md5(str(factors).encode(),
                               usedforsecurity=False).hexdigest()[:10]

        cache_dir = os.path.join(
            envs.VLLM_CACHE_ROOT,
            "torch_compile_cache",
            hash_key,
        )
        self.compilation_config.cache_dir = cache_dir

    cache_dir = self.compilation_config.cache_dir
    os.makedirs(cache_dir, exist_ok=True)
    # NOTE(review): writes back the value that was just read
    self.compilation_config.cache_dir = cache_dir
    # each (rank, dp_rank, prefix) combination gets its own subdirectory
    rank = vllm_config.parallel_config.rank
    dp_rank = vllm_config.parallel_config.data_parallel_rank
    local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}",
                                   self.prefix)
    os.makedirs(local_cache_dir, exist_ok=True)
    self.compilation_config.local_cache_dir = local_cache_dir

    disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE

    if disable_cache:
        logger.info("vLLM's torch.compile cache is disabled.")
    else:
        logger.info("Using cache directory: %s for vLLM's torch.compile",
                    local_cache_dir)

    self.compiler_manager.initialize_cache(local_cache_dir, disable_cache,
                                           self.prefix)

    # when dynamo calls the backend, it means the bytecode
    # transform and analysis are done
    compilation_counter.num_graphs_seen += 1
    from .monitor import torch_compile_start_time
    dynamo_time = time.time() - torch_compile_start_time
    logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time)
    self.compilation_config.compilation_time += dynamo_time

    # we control the compilation process, each instance can only be
    # called once
    # (note: the cache setup and counter updates above have already
    # happened by the time this check runs)
    assert not self._called, "VllmBackend can only be called once"

    self.graph = graph
    self.configure_post_pass()

    self.split_gm, self.piecewise_graphs = split_graph(
        graph, self.compilation_config.splitting_ops)

    from torch._dynamo.utils import lazy_format_graph_code

    # depyf will hook lazy_format_graph_code and dump the graph
    # for debugging, no need to print the graph here
    lazy_format_graph_code("before split", self.graph)
    lazy_format_graph_code("after split", self.split_gm)

    compilation_counter.num_piecewise_graphs_seen += len(
        self.piecewise_graphs)
    # the submodules that hold a splitting op are not compiled
    submod_names_to_compile = [
        item.submod_name for item in self.piecewise_graphs
        if not item.is_splitting_graph
    ]

    # propagate the split graph to the piecewise backend,
    # compile submodules with symbolic shapes
    PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile,
                                self.vllm_config,
                                self).run(*example_inputs)

    # dump the split graph as readable source; an already existing
    # file is left untouched
    graph_path = os.path.join(local_cache_dir, "computation_graph.py")
    if not os.path.exists(graph_path):
        # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
        # use `print_readable` because it can include submodules
        src = "from __future__ import annotations\nimport torch\n" + \
            self.split_gm.print_readable(print_output=False)
        src = src.replace("<lambda>", "GraphModule")
        with open(graph_path, "w") as f:
            f.write(src)

        logger.debug("Computation graph saved to %s", graph_path)

    self._called = True

    # without cudagraphs, or when inputs need not be copied, the
    # split graph module is returned as is
    if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE or \
        not self.compilation_config.cudagraph_copy_inputs:
        return self.split_gm

    # if we need to copy input buffers for cudagraph
    from torch._guards import detect_fake_mode
    fake_mode = detect_fake_mode()
    fake_args = [
        fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
        for t in example_inputs
    ]

    # index of tensors that have symbolic shapes (batch size)
    # for weights and static buffers, they will have concrete shapes.
    # symbolic shape only happens for input tensors.
    from torch.fx.experimental.symbolic_shapes import is_symbolic
    self.sym_tensor_indices = [
        i for i, x in enumerate(fake_args)
        if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \
            any(is_symbolic(d) for d in x.size())
    ]

    # compiler managed cudagraph input buffers
    # we assume the first run with symbolic shapes
    # has the maximum size among all the tensors
    self.input_buffers = [
        example_inputs[x].clone() for x in self.sym_tensor_indices
    ]

    # this is the callable we return to Dynamo to run
    def copy_and_call(*args):
        list_args = list(args)
        for i, index in enumerate(self.sym_tensor_indices):
            runtime_tensor = list_args[index]
            # only the leading dimension is used to size the slice
            # of the static buffer
            runtime_shape = runtime_tensor.shape[0]
            static_tensor = self.input_buffers[i][:runtime_shape]

            # copy the tensor to the static buffer
            static_tensor.copy_(runtime_tensor)

            # replace the tensor in the list_args to the static buffer
            list_args[index] = static_tensor
        return self.split_gm(*list_args)

    return copy_and_call

__init__ ¶

__init__(vllm_config: VllmConfig, prefix: str = '')

Source code in vllm/compilation/backends.py

def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
    """Set up the backend state; no compilation happens here.

    Args:
        vllm_config: the full vLLM config; its `compilation_config` is
            also stored separately and handed to the `CompilerManager`.
        prefix: tag stored as `self.prefix`. When empty, the
            module-level `model_tag` is used instead.
    """
    # A non-empty prefix (e.g. language_model, vision_model) is normally
    # enough to tell models apart. Parts that are initialized as
    # independent models come without one, so they fall back to the
    # current model_tag (backbone by default, eagle_head, ...).
    self.prefix = prefix if prefix else model_tag

    # Holds the passes that run on the graph post-grad.
    self.post_grad_pass_manager = PostGradPassManager()

    # Both start out empty.
    self.sym_tensor_indices = []
    self.input_buffers = []

    self.vllm_config = vllm_config
    compilation_config = vllm_config.compilation_config
    self.compilation_config = compilation_config

    self.compiler_manager: CompilerManager = CompilerManager(
        compilation_config)

configure_post_pass ¶

configure_post_pass()

Source code in vllm/compilation/backends.py

def configure_post_pass(self):
    """Configure the pass manager and install it in the Inductor config.

    The pass manager is configured from `self.vllm_config`. If the
    Inductor config already has an entry under
    `post_grad_custom_post_pass`, it is either checked against our pass
    manager (same uuid) or, when it is a plain `InductorPass`, added to
    it. Finally the pass manager is stored under that key.
    """
    compilation_config = self.compilation_config
    pass_manager = self.post_grad_pass_manager
    pass_manager.configure(self.vllm_config)

    # Inductor runs post-grad custom passes through a single hook; an
    # entry that is already there must be preserved, not dropped.
    inductor_config = compilation_config.inductor_compile_config
    PASS_KEY = "post_grad_custom_post_pass"
    if PASS_KEY in inductor_config:
        existing_pass = inductor_config[PASS_KEY]
        # The config is expected to have wrapped every inductor pass.
        if isinstance(existing_pass, PostGradPassManager):
            assert (existing_pass.uuid() == pass_manager.uuid())
        else:
            assert isinstance(existing_pass, InductorPass)
            pass_manager.add(existing_pass)
    inductor_config[PASS_KEY] = pass_manager

make_compiler ¶

make_compiler(
    compilation_config: CompilationConfig,
) -> CompilerInterface

Source code in vllm/compilation/backends.py

def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
    """Pick the compiler adaptor that matches `compilation_config`.

    Returns an `EagerAdaptor` when `use_inductor` is false. Otherwise
    returns an `InductorStandaloneAdaptor` if
    `envs.VLLM_USE_STANDALONE_COMPILE` is set and
    `is_torch_equal_or_newer("2.8.0.dev")` holds, and an
    `InductorAdaptor` in every other case.
    """
    if not compilation_config.use_inductor:
        logger.debug("Using EagerAdaptor")
        return EagerAdaptor()

    # the torch version is only checked when the env flag is set
    use_standalone = (envs.VLLM_USE_STANDALONE_COMPILE
                      and is_torch_equal_or_newer("2.8.0.dev"))
    if use_standalone:
        logger.debug("Using InductorStandaloneAdaptor")
        return InductorStandaloneAdaptor()

    logger.debug("Using InductorAdaptor")
    return InductorAdaptor()

set_model_tag ¶

set_model_tag(tag: str)

Context manager to set the model tag.

Source code in vllm/compilation/backends.py

@contextmanager
def set_model_tag(tag: str):
    """Temporarily replace the module-level `model_tag` with `tag`.

    The previous tag is put back when the `with` block exits, including
    when it exits with an exception. Asserts that `tag` differs from the
    tag that is current on entry.
    """
    global model_tag
    previous_tag = model_tag
    assert tag != previous_tag, \
        f"Model tag {tag} is the same as the current tag {previous_tag}."
    model_tag = tag
    try:
        yield
    finally:
        model_tag = previous_tag

split_graph ¶

split_graph(
    graph: GraphModule, ops: list[str]
) -> tuple[GraphModule, list[SplitItem]]

Source code in vllm/compilation/backends.py

def split_graph(graph: fx.GraphModule,
                ops: list[str]) -> tuple[fx.GraphModule, list[SplitItem]]:
    """Split `graph` into submodules at every call to one of `ops`.

    Args:
        graph: the graph module to split.
        ops: string forms (`str(node.target)`) of the `call_function`
            targets at which the graph is cut.

    Returns:
        A pair `(split_gm, items)`: the module produced by
        `split_module`, and one `SplitItem` per top-level submodule of
        it, sorted by integer graph id. An item has
        `is_splitting_graph=True` when its id is the one given to a
        node that matched `ops`.
    """
    # Give every node a partition id. A node that matches `ops` is put
    # alone in a partition of its own: the id is bumped right before and
    # right after it. Placeholder and output nodes are left unassigned.
    subgraph_id = 0
    node_to_subgraph_id = {}
    # a set, because it is only used for membership tests below
    split_op_graphs = set()
    for node in graph.graph.nodes:
        if node.op in ("output", "placeholder"):
            continue
        if node.op == 'call_function' and str(node.target) in ops:
            subgraph_id += 1
            node_to_subgraph_id[node] = subgraph_id
            split_op_graphs.add(subgraph_id)
            subgraph_id += 1
        else:
            node_to_subgraph_id[node] = subgraph_id

    # `keep_original_order` is important!
    # otherwise pytorch might reorder the nodes and
    # the semantics of the graph will change when we
    # have mutations in the graph
    split_gm = torch.fx.passes.split_module.split_module(
        graph,
        None,
        lambda node: node_to_subgraph_id[node],
        keep_original_order=True)

    outputs = []

    # collect the names first, then look each submodule up by name
    names = [name for name, _ in split_gm.named_modules()]

    for name in names:
        if "." in name or name == "":
            # recursive child module or the root module
            continue

        module = getattr(split_gm, name)

        # top-level submodules are named "submod_<id>"
        graph_id = int(name.removeprefix("submod_"))
        outputs.append(
            SplitItem(name, graph_id, (graph_id in split_op_graphs), module))

    # sort by integer graph_id, rather than string name
    outputs.sort(key=lambda x: x.graph_id)

    return split_gm, outputs

vllm.compilation.backends

compilation_start_time module-attribute ¶

logger module-attribute ¶

model_tag module-attribute ¶

CompilerManager ¶

cache instance-attribute ¶

compilation_config instance-attribute ¶

compiler instance-attribute ¶

is_cache_updated instance-attribute ¶

__init__ ¶

compile ¶

compute_hash ¶

initialize_cache ¶

load ¶

save_to_file ¶

PiecewiseCompileInterpreter ¶

compilation_config instance-attribute ¶

compile_submod_names instance-attribute ¶

extra_traceback instance-attribute ¶

fake_mode instance-attribute ¶

vllm_backend instance-attribute ¶

vllm_config instance-attribute ¶

__init__ ¶

call_module ¶

run ¶

SplitItem dataclass ¶

graph instance-attribute ¶

graph_id instance-attribute ¶

is_splitting_graph instance-attribute ¶

submod_name instance-attribute ¶

__init__ ¶

VllmBackend ¶

_called class-attribute instance-attribute ¶

compilation_config instance-attribute ¶

compiler_manager instance-attribute ¶

graph instance-attribute ¶

input_buffers instance-attribute ¶

piecewise_graphs instance-attribute ¶

post_grad_pass_manager instance-attribute ¶

post_grad_passes instance-attribute ¶

prefix instance-attribute ¶

returned_callable instance-attribute ¶

split_gm instance-attribute ¶

sym_tensor_indices instance-attribute ¶

vllm_config instance-attribute ¶

__call__ ¶

__init__ ¶

configure_post_pass ¶

make_compiler ¶

set_model_tag ¶

split_graph ¶

compilation_start_time `module-attribute` ¶

logger `module-attribute` ¶

model_tag `module-attribute` ¶

cache `instance-attribute` ¶

compilation_config `instance-attribute` ¶

compiler `instance-attribute` ¶

is_cache_updated `instance-attribute` ¶

init ¶

compilation_config `instance-attribute` ¶

compile_submod_names `instance-attribute` ¶

extra_traceback `instance-attribute` ¶

fake_mode `instance-attribute` ¶

vllm_backend `instance-attribute` ¶

vllm_config `instance-attribute` ¶

init ¶

SplitItem `dataclass` ¶

graph `instance-attribute` ¶

graph_id `instance-attribute` ¶

is_splitting_graph `instance-attribute` ¶

submod_name `instance-attribute` ¶

init ¶

_called `class-attribute` `instance-attribute` ¶

compilation_config `instance-attribute` ¶

compiler_manager `instance-attribute` ¶

graph `instance-attribute` ¶

input_buffers `instance-attribute` ¶

piecewise_graphs `instance-attribute` ¶

post_grad_pass_manager `instance-attribute` ¶

post_grad_passes `instance-attribute` ¶

prefix `instance-attribute` ¶

returned_callable `instance-attribute` ¶

split_gm `instance-attribute` ¶

sym_tensor_indices `instance-attribute` ¶

vllm_config `instance-attribute` ¶

call ¶

init ¶