Buckets:

download
raw
35.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;ParaAttention&quot;,&quot;local&quot;:&quot;paraattention&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;第一块缓存&quot;,&quot;local&quot;:&quot;第一块缓存&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;fp8 量化&quot;,&quot;local&quot;:&quot;fp8-量化&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;上下文并行性&quot;,&quot;local&quot;:&quot;上下文并行性&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;基准测试&quot;,&quot;local&quot;:&quot;基准测试&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/diffusers/pr_12652/zh/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/entry/start.ca7a833f.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/scheduler.e4ff9b64.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/singletons.71526a34.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/index.f9be34a7.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/paths.0df57e7f.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/entry/app.746b83f3.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/preload-helper.bb94e341.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/index.09f1bca0.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/nodes/0.8237e20e.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/nodes/32.37c0b9c7.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.3bffcf96.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/CodeBlock.3dd9a65d.js">
<link rel="modulepreload" href="/docs/diffusers/pr_12652/zh/_app/immutable/chunks/HfOption.44827c7f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;ParaAttention&quot;,&quot;local&quot;:&quot;paraattention&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;第一块缓存&quot;,&quot;local&quot;:&quot;第一块缓存&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;fp8 量化&quot;,&quot;local&quot;:&quot;fp8-量化&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;上下文并行性&quot;,&quot;local&quot;:&quot;上下文并行性&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;基准测试&quot;,&quot;local&quot;:&quot;基准测试&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy 
page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="paraattention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#paraattention"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ParaAttention</span></h1> <div class="flex justify-center" data-svelte-h="svelte-1p4slnk"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-performance.png"></div> <div class="flex justify-center" 
data-svelte-h="svelte-1kqq4mt"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-performance.png"></div> <p data-svelte-h="svelte-1aznnck">大型图像和视频生成模型,如 <a href="https://huggingface.co/black-forest-labs/FLUX.1-dev" rel="nofollow">FLUX.1-dev</a> 和 <a href="https://huggingface.co/tencent/HunyuanVideo" rel="nofollow">HunyuanVideo</a>,由于其规模,可能对实时应用和部署构成推理挑战。</p> <p data-svelte-h="svelte-nxzfeq"><a href="https://github.com/chengzeyi/ParaAttention" rel="nofollow">ParaAttention</a> 是一个实现了<strong>上下文并行</strong>和<strong>第一块缓存</strong>的库,可以与其他技术(如 torch.compile、fp8 动态量化)结合使用,以加速推理。</p> <p data-svelte-h="svelte-ckrkoc">本指南将展示如何在 NVIDIA L20 GPU 上对 FLUX.1-dev 和 HunyuanVideo 应用 ParaAttention。
在我们的基线基准测试中,除了 HunyuanVideo 为避免内存不足错误外,未应用任何优化。</p> <p data-svelte-h="svelte-wz0gnx">我们的基线基准测试显示,FLUX.1-dev 能够在 28 步中生成 1024x1024 分辨率图像,耗时 26.36 秒;HunyuanVideo 能够在 30 步中生成 129 帧 720p 分辨率视频,耗时 3675.71 秒。</p> <blockquote class="tip" data-svelte-h="svelte-m6tkds"><p>对于更快的上下文并行推理,请尝试使用支持 NVLink 的 NVIDIA A100 或 H100 GPU(如果可用),尤其是在 GPU 数量较多时。</p></blockquote> <h2 class="relative group"><a id="第一块缓存" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#第一块缓存"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>第一块缓存</span></h2> <p data-svelte-h="svelte-17ussag">缓存模型中 transformer 块的输出并在后续推理步骤中重用它们,可以降低计算成本并加速推理。</p> <p data-svelte-h="svelte-1dduett">然而,很难决定何时重用缓存以确保生成图像或视频的质量。ParaAttention 直接使用<strong>第一个 transformer 块输出的残差差异</strong>来近似模型输出之间的差异。当差异足够小时,重用先前推理步骤的残差差异。换句话说,跳过去噪步骤。</p> <p data-svelte-h="svelte-1y9k7c">这在 FLUX.1-dev 和 HunyuanVideo 推理上实现了 2 倍加速,且质量非常好。</p> <figure data-svelte-h="svelte-1cjjxth"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/ada-cache.png" alt="Cache in Diffusion Transformer"> <figcaption>AdaCache 的工作原理,第一块缓存是其变体</figcaption></figure> <div class="flex 
space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">FLUX-1.dev </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">HunyuanVideo </div></div> <div class="language-select"><p data-svelte-h="svelte-13o107o">要在 FLUX.1-dev 上应用第一块缓存,请调用 <code>apply_cache_on_pipe</code>,如下所示。0.08 是 FLUX 模型的默认残差差异值。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe, residual_diff_threshold=<span class="hljs-number">0.08</span>)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
begin = time.time()
image = pipe(
<span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
num_inference_steps=<span class="hljs-number">28</span>,
).images[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving image to flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)<!-- HTML_TAG_END --></pre></div> <table data-svelte-h="svelte-9gupjq"><thead><tr><th>优化</th> <th>原始</th> <th>FBCache rdt=0.06</th> <th>FBCache rdt=0.08</th> <th>FBCache rdt=0.10</th> <th>FBCache rdt=0.12</th></tr></thead> <tbody><tr><td>预览</td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-original.png" alt="Original"></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.06.png" alt="FBCache rdt=0.06"></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.08.png" alt="FBCache rdt=0.08"></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.10.png" alt="FBCache rdt=0.10"></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.12.png" alt="FBCache rdt=0.12"></td></tr> <tr><td>墙时间 (s)</td> <td>26.36</td> <td>21.83</td> <td>17.01</td> <td>16.00</td> <td>13.78</td></tr></tbody></table> <p data-svelte-h="svelte-59xgrt">First Block Cache 将推理速度降低到 17.01 秒,与基线相比,或快 1.55 倍,同时保持几乎零质量损失。</p> </div> <h2 class="relative group"><a id="fp8-量化" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp8-量化"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 
0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>fp8 量化</span></h2> <p data-svelte-h="svelte-iz99fc">fp8 动态量化进一步加速推理并减少内存使用。为了使用 8 位 <a href="https://www.nvidia.com/en-us/data-center/tensor-cores/" rel="nofollow">NVIDIA Tensor Cores</a>,必须对激活和权重进行量化。</p> <p data-svelte-h="svelte-3fzx9w">使用 <code>float8_weight_only</code><code>float8_dynamic_activation_float8_weight</code> 来量化文本编码器和变换器模型。</p> <p data-svelte-h="svelte-f45i0k">默认量化方法是逐张量量化,但如果您的 GPU 支持逐行量化,您也可以尝试它以获得更好的准确性。</p> <p data-svelte-h="svelte-1s76vee">使用以下命令安装 <a href="https://github.com/pytorch/ao/tree/main" rel="nofollow">torchao</a></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: 
transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip3 install -U torch torchao<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ubr0pd"><a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a> 使用 <code>mode=&quot;max-autotune-no-cudagraphs&quot;</code><code>mode=&quot;max-autotune&quot;</code> 选择最佳内核以获得性能。如果是第一次调用模型,编译可能会花费很长时间,但一旦模型编译完成,这是值得的。</p> <p data-svelte-h="svelte-1w4d2au">此示例仅量化变换器模型,但您也可以量化文本编码器以进一步减少内存使用。</p> <blockquote class="tip" data-svelte-h="svelte-5igzue"><p>动态量化可能会显著改变模型输出的分布,因此您需要将 <code>residual_diff_threshold</code> 设置为更大的值以使其生效。</p></blockquote> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">FLUX-1.dev </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">HunyuanVideo </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none 
transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(
pipe,
residual_diff_threshold=<span class="hljs-number">0.12</span>, <span class="hljs-comment"># 使用更大的值以使缓存生效</span>
)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
begin = time.time()
image = pipe(
<span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
num_inference_steps=<span class="hljs-number">28</span>,
).images[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">else</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;保存图像到 flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vnhv5y">fp8 动态量化和 torch.compile 将推理速度降低至 7.56 秒,相比基线快了 3.48 倍。</p> </div> <h2 class="relative group"><a id="上下文并行性" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#上下文并行性"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>上下文并行性</span></h2> <p data-svelte-h="svelte-125hian">上下文并行性并行化推理并随多个 GPU 扩展。ParaAttention 组合设计允许您将上下文并行性与第一块缓存和动态量化结合使用。</p> <blockquote class="tip" data-svelte-h="svelte-zcqlrq"><p>请参考 <a href="https://github.com/chengzeyi/ParaAttention/tree/main" rel="nofollow">ParaAttention</a> 仓库获取详细说明和如何使用多个 GPU 扩展推理的示例。</p></blockquote> <p data-svelte-h="svelte-gwlw48">如果推理过程需要持久化和可服务,建议使用 <a href="https://pytorch.org/docs/stable/multiprocessing.html" rel="nofollow">torch.multiprocessing</a> 编写您自己的推理处理器。这可以消除启动进程以及加载和重新编译模型的开销。</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">FLUX-1.dev </div><div class="flex items-center 
border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">HunyuanVideo </div></div> <div class="language-select"><p data-svelte-h="svelte-1wcnxcs">以下代码示例结合了第一块缓存、fp8动态量化、torch.compile和上下文并行,以实现最快的推理速度。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> torch.distributed <span class="hljs-keyword">as</span> dist
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
dist.init_process_group()
torch.cuda.set_device(dist.get_rank())
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.context_parallel <span class="hljs-keyword">import</span> init_context_parallel_mesh
<span class="hljs-keyword">from</span> para_attn.context_parallel.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_pipe
<span class="hljs-keyword">from</span> para_attn.parallel_vae.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_vae
mesh = init_context_parallel_mesh(
pipe.device.<span class="hljs-built_in">type</span>,
max_ring_dim_size=<span class="hljs-number">2</span>,
)
parallelize_pipe(
pipe,
mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(
pipe,
residual_diff_threshold=<span class="hljs-number">0.12</span>, <span class="hljs-comment"># 使用较大的值以使缓存生效</span>
)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
torch._inductor.config.reorder_for_compute_comm_overlap = <span class="hljs-literal">True</span>
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
begin = time.time()
image = pipe(
<span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
num_inference_steps=<span class="hljs-number">28</span>,
output_type=<span class="hljs-string">&quot;pil&quot;</span> <span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;pt&quot;</span>,
).images[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">else</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;将图像保存到flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)
dist.destroy_process_group()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bdq9oz">保存到<code>run_flux.py</code>并使用<a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>启动。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># 使用--nproc_per_node指定GPU数量</span>
torchrun --nproc_per_node=2 run_flux.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19q14lf">推理速度降至8.20秒,相比基线快了3.21倍,使用2个NVIDIA L20 GPU。在4个L20上,推理速度为3.90秒,快了6.75倍。</p> </div> <h2 class="relative group"><a id="基准测试" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#基准测试"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>基准测试</span></h2> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">FLUX-1.dev </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">HunyuanVideo </div></div> <div class="language-select"><table data-svelte-h="svelte-qm3hdc"><thead><tr><th>GPU 类型</th> <th>GPU 数量</th> <th>优化</th> <th>墙钟时间 (s)</th> <th>加速比</th></tr></thead> <tbody><tr><td>NVIDIA L20</td> <td>1</td> <td>基线</td> <td>26.36</td> <td>1.00x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FBCache (rdt=0.08)</td> 
<td>17.01</td> <td>1.55x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FP8 DQ</td> <td>13.40</td> <td>1.96x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FBCache (rdt=0.12) + FP8 DQ</td> <td>7.56</td> <td>3.48x</td></tr> <tr><td>NVIDIA L20</td> <td>2</td> <td>FBCache (rdt=0.12) + FP8 DQ + CP</td> <td>4.92</td> <td>5.35x</td></tr> <tr><td>NVIDIA L20</td> <td>4</td> <td>FBCache (rdt=0.12) + FP8 DQ + CP</td> <td>3.90</td> <td>6.75x</td></tr></tbody></table> </div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/para_attn.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// SvelteKit client bootstrap (build-generated — do not hand-edit the logic):
// loads the runtime entry modules and starts the app on this page's markup.
{
// Build-scoped global configuration read by the runtime modules imported
// below; the "oaf0l2" suffix is a per-build hash that namespaces this global.
__sveltekit_oaf0l2 = {
assets: "/docs/diffusers/pr_12652/zh",
base: "/docs/diffusers/pr_12652/zh",
env: {}
};
// Mount target: the parent element of this inline <script> tag.
const element = document.currentScript.parentElement;
// Serialized route data passed to kit.start (no load results for this page).
const data = [null,null];
// Fetch the kit runtime and the app module in parallel, then start the app.
Promise.all([
import("/docs/diffusers/pr_12652/zh/_app/immutable/entry/start.ca7a833f.js"),
import("/docs/diffusers/pr_12652/zh/_app/immutable/entry/app.746b83f3.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 32], // matches nodes/0.*.js and nodes/32.*.js preloaded in <head>
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
35.8 kB
·
Xet hash:
914c91234c5dff97405e55d4bc411dc10ac90175944e56d63f4eeadb448da56f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.