Buckets:

download
raw
55.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;加速推理&quot;,&quot;local&quot;:&quot;加速推理&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;模型数据类型&quot;,&quot;local&quot;:&quot;模型数据类型&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;缩放点积注意力&quot;,&quot;local&quot;:&quot;缩放点积注意力&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;torch.compile&quot;,&quot;local&quot;:&quot;torchcompile&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;动态形状编译&quot;,&quot;local&quot;:&quot;动态形状编译&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;区域编译&quot;,&quot;local&quot;:&quot;区域编译&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;图中断&quot;,&quot;local&quot;:&quot;图中断&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;GPU同步&quot;,&quot;local&quot;:&quot;gpu同步&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;基准测试&quot;,&quot;local&quot;:&quot;基准测试&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;动态量化&quot;,&quot;local&quot;:&quot;动态量化&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;融合投影矩阵&quot;,&quot;local&quot;:&quot;融合投影矩阵&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;资源&quot;,&quot;local&quot;:&quot;资源&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/diffusers/pr_12652/zh/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/entry/start.ca7a833f.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/scheduler.e4ff9b64.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/singletons.71526a34.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/index.f9be34a7.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/paths.0df57e7f.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/entry/app.746b83f3.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/preload-helper.bb94e341.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/index.09f1bca0.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/nodes/0.8237e20e.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/nodes/25.3765bf0e.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.3bffcf96.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/CodeBlock.3dd9a65d.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/HfOption.44827c7f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;加速推理&quot;,&quot;local&quot;:&quot;加速推理&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;模型数据类型&quot;,&quot;local&quot;:&quot;模型数据类型&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;缩放点积注意力&quot;,&quot;local&quot;:&quot;缩放点积注意力&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;torch.compile&quot;,&quot;local&quot;:&quot;torchcompile&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;动态形状编译&quot;,&quot;local&quot;:&quot;动态形状编译&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;区域编译&quot;,&quot;local&quot;:&quot;区域编译&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;图中断&quot;,&quot;local&quot;:&quot;图中断&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;GPU同步&quot;,&quot;local&quot;:&quot;gpu同步&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;基准测试&quot;,&quot;local&quot;:&quot;基准测试&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;动态量化&quot;,&quot;local&quot;:&quot;动态量化&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;融合投影矩阵&quot;,&quot;local&quot;:&quot;融合投影矩阵&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;资源&quot;,&quot;local&quot;:&quot;资源&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="加速推理" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#加速推理"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>加速推理</span></h1> <p data-svelte-h="svelte-194j5uc">Diffusion模型在推理时速度较慢,因为生成是一个迭代过程,需要经过一定数量的”步数”逐步将噪声细化为图像或视频。要加速这一过程,您可以尝试使用不同的<a href="../api/schedulers/overview">调度器</a>、降低模型权重的精度以加快计算、使用更高效的内存注意力机制等方法。</p> <p data-svelte-h="svelte-k2hqss">将这些技术组合使用,可以比单独使用任何一种技术获得更快的推理速度。</p> <p data-svelte-h="svelte-x0c89o">本指南将介绍如何加速推理。</p> <h2 class="relative group"><a id="模型数据类型" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#模型数据类型"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>模型数据类型</span></h2> <p data-svelte-h="svelte-py0rfh">模型权重的精度和数据类型会影响推理速度,因为更高的精度需要更多内存来加载,也需要更多时间进行计算。PyTorch默认以float32或全精度加载模型权重,因此更改数据类型是快速获得更快推理速度的简单方法。</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">bfloat16 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">float16 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">TensorFloat-32 </div></div> <div class="language-select"><p data-svelte-h="svelte-1u2chbj">bfloat16与float16类似,但对数值误差更稳健。硬件对bfloat16的支持各不相同,但大多数现代GPU都能支持bfloat16。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="缩放点积注意力" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#缩放点积注意力"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>缩放点积注意力</span></h2> <blockquote class="tip" data-svelte-h="svelte-8dllu1"><p>内存高效注意力优化了推理速度<em></em><a href="./memory#memory-efficient-attention">内存使用</a></p></blockquote> <p data-svelte-h="svelte-3w1pt5"><a href="https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html" rel="nofollow">缩放点积注意力(SDPA)</a>实现了多种注意力后端,包括<a href="https://github.com/Dao-AILab/flash-attention" rel="nofollow">FlashAttention</a><a href="https://github.com/facebookresearch/xformers" rel="nofollow">xFormers</a>和原生C++实现。它会根据您的硬件自动选择最优的后端。</p> <p data-svelte-h="svelte-qbrif4">如果您使用的是PyTorch &gt;= 2.0,SDPA默认启用,无需对代码进行任何额外更改。不过,您也可以尝试使用其他注意力后端来自行选择。下面的示例使用<a href="https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html" rel="nofollow">torch.nn.attention.sdpa_kernel</a>上下文管理器来启用高效注意力。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch.nn.attention <span class="hljs-keyword">import</span> SDPBackend, sdpa_kernel
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
<span class="hljs-keyword">with</span> sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
image = pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="torchcompile" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#torchcompile"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>torch.compile</span></h2> <p data-svelte-h="svelte-1pe18dl"><a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a>通过将PyTorch代码和操作编译为优化的内核来加速推理。Diffusers通常会编译计算密集型的模型,如UNet、transformer或VAE。</p> <p data-svelte-h="svelte-g36iym">启用以下编译器设置以获得最大速度(更多选项请参阅<a href="https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py" rel="nofollow">完整列表</a>)。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wsgro0">加载并编译UNet和VAE。有几种不同的模式可供选择,但<code>&quot;max-autotune&quot;</code>通过编译为CUDA图来优化速度。CUDA图通过单个CPU操作启动多个GPU操作,有效减少了开销。</p> <blockquote class="tip" data-svelte-h="svelte-1fkr80r"><p>在PyTorch 2.3.1中,您可以控制torch.compile的缓存行为。这对于像<code>&quot;max-autotune&quot;</code>这样的编译模式特别有用,它会通过网格搜索多个编译标志来找到最优配置。更多详情请参阅<a href="https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html" rel="nofollow">torch.compile中的编译时间缓存</a>教程。</p></blockquote> <p data-svelte-h="svelte-9ijocs">将内存布局更改为<a href="./memory#torchchannels_last">channels_last</a>也可以优化内存和推理速度。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)
pipeline.unet = torch.<span class="hljs-built_in">compile</span>(
pipeline.unet, mode=<span class="hljs-string">&quot;max-autotune&quot;</span>, fullgraph=<span class="hljs-literal">True</span>
)
pipeline.vae.decode = torch.<span class="hljs-built_in">compile</span>(
pipeline.vae.decode,
mode=<span class="hljs-string">&quot;max-autotune&quot;</span>,
fullgraph=<span class="hljs-literal">True</span>
)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15it70t">第一次编译时速度较慢,但一旦编译完成,速度会显著提升。尽量只在相同类型的推理操作上使用编译后的管道。在不同尺寸的图像上调用编译后的管道会重新触发编译,这会很慢且效率低下。</p> <h3 class="relative group"><a id="动态形状编译" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#动态形状编译"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>动态形状编译</span></h3> <blockquote class="tip" data-svelte-h="svelte-16e1eoe"><p>确保始终使用PyTorch的nightly版本以获得更好的支持。</p></blockquote> <p data-svelte-h="svelte-1ahxi03"><code>torch.compile</code>会跟踪输入形状和条件,如果这些不同,它会重新编译模型。例如,如果模型是在1024x1024分辨率的图像上编译的,而在不同分辨率的图像上使用,就会触发重新编译。</p> <p data-svelte-h="svelte-169dym5">为避免重新编译,添加<code>dynamic=True</code>以尝试生成更动态的内核,避免条件变化时重新编译。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-addition">+ torch.fx.experimental._config.use_duck_shape = False</span>
<span class="hljs-addition">+ pipeline.unet = torch.compile(</span>
pipeline.unet, fullgraph=True, dynamic=True
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jmoihs">指定<code>use_duck_shape=False</code>会指示编译器是否应使用相同的符号变量来表示相同大小的输入。更多详情请参阅此<a href="https://github.com/huggingface/diffusers/pull/11327#discussion_r2047659790" rel="nofollow">评论</a></p> <p data-svelte-h="svelte-w2qqjo">并非所有模型都能开箱即用地从动态编译中受益,可能需要更改。参考此<a href="https://github.com/huggingface/diffusers/pull/11297/" rel="nofollow">PR</a>,它改进了<code>AuraFlowPipeline</code>的实现以受益于动态编译。</p> <p data-svelte-h="svelte-dsv3x4">如果动态编译对Diffusers模型的效果不如预期,请随时提出问题。</p> <h3 class="relative group"><a id="区域编译" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#区域编译"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>区域编译</span></h3> <p data-svelte-h="svelte-1nyxskn"><a href="https://docs.pytorch.org/tutorials/recipes/regional_compilation.html" rel="nofollow">区域编译</a>通过仅编译模型中<em>小而频繁重复的块</em>(通常是transformer层)来减少冷启动延迟,并为每个后续出现的块重用编译后的工件。对于许多diffusion架构,这提供了与全图编译相同的运行时加速,并将编译时间减少了8-10倍。</p> <p data-svelte-h="svelte-1qqd039">使用<code>compile_repeated_blocks()</code>方法(一个包装<code>torch.compile</code>的辅助函数)在任何组件(如transformer模型)上,如下所示。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># pip install -U diffusers</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>,
torch_dtype=torch.float16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-comment"># 仅编译UNet中重复的transformer层</span>
pipeline.unet.compile_repeated_blocks(fullgraph=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wmi0r6">要为新模型启用区域编译,请在模型类中添加一个<code>_repeated_blocks</code>属性,包含您想要编译的块的类名(作为字符串)。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">MyUNet</span>(<span class="hljs-title class_ inherited__">ModelMixin</span>):
_repeated_blocks = (<span class="hljs-string">&quot;Transformer2DModel&quot;</span>,) <span class="hljs-comment"># ← 默认编译</span><!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-1tna39"><p>更多区域编译示例,请参阅参考<a href="https://github.com/huggingface/diffusers/pull/11705" rel="nofollow">PR</a></p></blockquote> <p data-svelte-h="svelte-6v6dgf"><a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate</a>中还有一个<a href="https://github.com/huggingface/accelerate/blob/273799c85d849a1954a4f2e65767216eb37fa089/src/accelerate/utils/other.py#L78" rel="nofollow">compile_regions</a>方法,可以自动选择模型中的候选块进行编译。其余图会单独编译。这对于快速实验很有用,因为您不需要设置哪些块要编译或调整编译标志。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># pip install -U accelerate</span>
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
<span class="hljs-keyword">from</span> accelerate.utils <span class="hljs-keyword">import</span> <span class="hljs-built_in">compile</span> regions
pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet = compile_regions(pipeline.unet, mode=<span class="hljs-string">&quot;reduce-overhead&quot;</span>, fullgraph=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zlqqst"><code>compile_repeated_blocks()</code>是故意显式的。在<code>_repeated_blocks</code>中列出要重复的块,辅助函数仅编译这些块。它提供了可预测的行为,并且只需一行代码即可轻松推理缓存重用。</p> <h3 class="relative group"><a id="图中断" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#图中断"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>图中断</span></h3> <p data-svelte-h="svelte-qa92su">在torch.compile中指定<code>fullgraph=True</code>非常重要,以确保底层模型中没有图中断。这使您可以充分利用torch.compile而不会降低性能。对于UNet和VAE,这会改变您访问返回变量的方式。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-deletion">- latents = unet(</span>
<span class="hljs-deletion">- latents, timestep=timestep, encoder_hidden_states=prompt_embeds</span>
<span class="hljs-deletion">-).sample</span>
<span class="hljs-addition">+ latents = unet(</span>
<span class="hljs-addition">+ latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False</span>
<span class="hljs-addition">+)[0]</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="gpu同步" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gpu同步"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>GPU同步</span></h3> <p data-svelte-h="svelte-jzcid">每次去噪器做出预测后,调度器的<code>step()</code>函数会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228" rel="nofollow">调用</a>,并且<code>sigmas</code>变量会被<a href="https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476" rel="nofollow">索引</a>。当放在GPU上时,这会引入延迟,因为CPU和GPU之间需要进行通信同步。当去噪器已经编译时,这一点会更加明显。</p> <p data-svelte-h="svelte-11fadgl">一般来说,<code>sigmas</code>应该<a href="https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240" rel="nofollow">保持在CPU上</a>,以避免通信同步和延迟。</p> <blockquote class="tip" data-svelte-h="svelte-yipkr8"><p>参阅<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何为扩散模型最大化<code>torch.compile</code>的性能。</p></blockquote> <h3 class="relative group"><a id="基准测试" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#基准测试"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>基准测试</span></h3> <p data-svelte-h="svelte-h31nps">参阅<a href="https://huggingface.co/datasets/diffusers/benchmarks" rel="nofollow">diffusers/benchmarks</a>数据集,查看编译管道的推理延迟和内存使用数据。</p> <p data-svelte-h="svelte-oh3qub"><a href="https://github.com/sayakpaul/diffusers-torchao#benchmarking-results" rel="nofollow">diffusers-torchao</a>仓库还包含Flux和CogVideoX编译版本的基准测试结果。</p> <h2 class="relative group"><a id="动态量化" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#动态量化"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>动态量化</span></h2> <p data-svelte-h="svelte-1esyvy8"><a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态量化</a>通过降低精度以加快数学运算来提高推理速度。这种特定类型的量化在运行时根据数据确定如何缩放激活,而不是使用固定的缩放因子。因此,缩放因子与数据更准确地匹配。</p> <p data-svelte-h="svelte-10dayln">以下示例使用<a href="../quantization/torchao">torchao</a>库对UNet和VAE应用<a href="https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html" rel="nofollow">动态int8量化</a></p> <blockquote class="tip" data-svelte-h="svelte-1xrzhvm"><p>参阅我们的<a href="../quantization/torchao">torchao</a>文档,了解更多关于如何使用Diffusers torchao集成的信息。</p></blockquote> <p data-svelte-h="svelte-19bhy1d">配置编译器标志以获得最大速度。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> torchao <span class="hljs-keyword">import</span> apply_dynamic_quant
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLPipeline
torch._inductor.config.conv_1x1_as_mm = <span class="hljs-literal">True</span>
torch._inductor.config.coordinate_descent_tuning = <span class="hljs-literal">True</span>
torch._inductor.config.epilogue_fusion = <span class="hljs-literal">False</span>
torch._inductor.config.coordinate_descent_check_all_directions = <span class="hljs-literal">True</span>
torch._inductor.config.force_fuse_int_mm_with_mul = <span class="hljs-literal">True</span>
torch._inductor.config.use_mixed_mm = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17j89ie">使用<a href="https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16" rel="nofollow">dynamic_quant_filter_fn</a>过滤掉UNet和VAE中一些不会从动态量化中受益的线性层。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline = StableDiffusionXLPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, torch_dtype=torch.bfloat16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
apply_dynamic_quant(pipeline.unet, dynamic_quant_filter_fn)
apply_dynamic_quant(pipeline.vae, dynamic_quant_filter_fn)
prompt = <span class="hljs-string">&quot;Astronaut in a jungle, cold color palette, muted colors, detailed, 8k&quot;</span>
pipeline(prompt, num_inference_steps=<span class="hljs-number">30</span>).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="融合投影矩阵" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#融合投影矩阵"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>融合投影矩阵</span></h2> <blockquote data-svelte-h="svelte-tty6r3"><p>[!WARNING][fuse_qkv_projections](<a href="https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)%E6%96%B9%E6%B3%95%E6%98%AF%E5%AE%9E%E9%AA%8C%E6%80%A7%E7%9A%84%EF%BC%8C%E7%9B%AE%E5%89%8D%E4%B8%BB%E8%A6%81%E6%94%AF%E6%8C%81Stable" rel="nofollow">https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034)方法是实验性的,目前主要支持Stable</a> Diffusion管道。参阅此<a href="https://github.com/huggingface/diffusers/pull/6179" rel="nofollow">PR</a>了解如何为其他管道启用它。</p></blockquote> <p data-svelte-h="svelte-qv5ldk">在注意力块中,输入被投影到三个子空间,分别由投影矩阵Q、K和V表示。这些投影通常单独计算,但您可以水平组合这些矩阵为一个矩阵,并在单步中执行投影。这会增加输入投影的矩阵乘法大小,并提高量化的效果。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline.fuse_qkv_projections()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="资源" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#资源"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>资源</span></h2> <ul data-svelte-h="svelte-58n17n"><li><p>阅读<a href="https://pytorch.org/blog/presenting-flux-fast-making-flux-go-brrr-on-h100s/" rel="nofollow">Presenting Flux Fast: Making Flux go brrr on H100s</a>博客文章,了解如何结合所有这些优化与<a href="https://docs.pytorch.org/docs/stable/torch.compiler.html" rel="nofollow">TorchInductor</a><a href="https://docs.pytorch.org/docs/stable/torch.compiler_aot_inductor.html" rel="nofollow">AOTInductor</a>,使用<a href="https://github.com/huggingface/flux-fast" rel="nofollow">flux-fast</a>的配方获得约2.5倍的加速。</p> <p>这些配方支持AMD硬件和<a href="https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev" rel="nofollow">Flux.1 Kontext Dev</a></p></li> <li><p>阅读<a href="https://pytorch.org/blog/torch-compile-and-diffusers-a-hands-on-guide-to-peak-performance/" rel="nofollow">torch.compile和Diffusers:峰值性能实践指南</a>博客文章,了解如何在使用<code>torch.compile</code>时最大化性能。</p></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/fp16.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_oaf0l2 = {
assets: "/docs/diffusers/pr_12652/zh",
base: "/docs/diffusers/pr_12652/zh",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/diffusers/pr_12652/zh/_app/immutable/entry/start.ca7a833f.js"),
import("/docs/diffusers/pr_12652/zh/_app/immutable/entry/app.746b83f3.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 25],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
55.5 kB
·
Xet hash:
7db74a7bb4bb69075dde71e52c770dfaae82ef633e57c9d2c8f06c98d5573f70

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.