56.8 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"메모리와 속도","local":"메모리와-속도","sections":[{"title":"cuDNN auto-tuner 활성화하기","local":"cudnn-auto-tuner-활성화하기","sections":[{"title":"fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서)","local":"fp32-대신-tf32-사용하기-ampere-및-이후-cuda-장치들에서","sections":[],"depth":3}],"depth":2},{"title":"반정밀도 가중치","local":"반정밀도-가중치","sections":[],"depth":2},{"title":"추가 메모리 절약을 위한 슬라이스 어텐션","local":"추가-메모리-절약을-위한-슬라이스-어텐션","sections":[],"depth":2},{"title":"더 큰 배치를 위한 sliced VAE 디코드","local":"더-큰-배치를-위한-sliced-vae-디코드","sections":[],"depth":2},{"title":"Channels Last 메모리 형식 사용하기","local":"channels-last-메모리-형식-사용하기","sections":[],"depth":2},{"title":"추적(tracing)","local":"추적tracing","sections":[],"depth":2},{"title":"Memory-efficient attention","local":"memory-efficient-attention","sections":[],"depth":2}],"depth":1}">
	<link href="/docs/diffusers/pr_11234/ko/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/entry/start.17c9338f.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/scheduler.94020406.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/singletons.92825be8.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/index.8b553f6b.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/paths.84feebad.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/entry/app.0ba20c84.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/index.a08c8d92.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/nodes/0.6f6037b1.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/nodes/11.3a8f04f4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/Tip.3b0aeee8.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/CodeBlock.f1fae7de.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11234/ko/_app/immutable/chunks/index.9fb21c13.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"메모리와 속도","local":"메모리와-속도","sections":[{"title":"cuDNN auto-tuner 활성화하기","local":"cudnn-auto-tuner-활성화하기","sections":[{"title":"fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서)","local":"fp32-대신-tf32-사용하기-ampere-및-이후-cuda-장치들에서","sections":[],"depth":3}],"depth":2},{"title":"반정밀도 가중치","local":"반정밀도-가중치","sections":[],"depth":2},{"title":"추가 메모리 절약을 위한 슬라이스 어텐션","local":"추가-메모리-절약을-위한-슬라이스-어텐션","sections":[],"depth":2},{"title":"더 큰 배치를 위한 sliced VAE 디코드","local":"더-큰-배치를-위한-sliced-vae-디코드","sections":[],"depth":2},{"title":"Channels Last 메모리 형식 사용하기","local":"channels-last-메모리-형식-사용하기","sections":[],"depth":2},{"title":"추적(tracing)","local":"추적tracing","sections":[],"depth":2},{"title":"Memory-efficient attention","local":"memory-efficient-attention","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="메모리와-속도" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#메모리와-속도"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>메모리와 속도</span></h1> <p data-svelte-h="svelte-m9figs">메모리 또는 속도에 대해 🤗 Diffusers <em>추론</em>을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
	일반적으로, memory-efficient attention을 위해 <a href="https://github.com/facebookresearch/xformers" rel="nofollow">xFormers</a> 사용을 추천하기 때문에, 추천하는 <a href="xformers">설치 방법</a>을 보고 설치해 보세요.</p> <p data-svelte-h="svelte-12e7d13">다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.</p> <table data-svelte-h="svelte-1sy2nlq"><thead><tr><th></th> <th>지연시간</th> <th>속도 향상</th></tr></thead> <tbody><tr><td>별도 설정 없음</td> <td>9.50s</td> <td>x1</td></tr> <tr><td>cuDNN auto-tuner</td> <td>9.37s</td> <td>x1.01</td></tr> <tr><td>fp16</td> <td>3.61s</td> <td>x2.63</td></tr> <tr><td>Channels Last 메모리 형식</td> <td>3.30s</td> <td>x2.88</td></tr> <tr><td>traced UNet</td> <td>3.21s</td> <td>x2.96</td></tr> <tr><td>memory-efficient attention</td> <td>2.63s</td> <td>x3.61</td></tr></tbody></table> <em data-svelte-h="svelte-1iy2bqt">NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.</em> <h2 class="relative group"><a id="cudnn-auto-tuner-활성화하기" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cudnn-auto-tuner-활성화하기"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>cuDNN auto-tuner 활성화하기</span></h2> <p data-svelte-h="svelte-1q6vy6i"><a href="https://developer.nvidia.com/cudnn" rel="nofollow">NVIDIA cuDNN</a>은 컨볼루션을 계산하는 많은 알고리즘을 지원합니다. Autotuner는 짧은 벤치마크를 실행하고 주어진 입력 크기에 대해 주어진 하드웨어에서 최고의 성능을 가진 커널을 선택합니다.</p> <p data-svelte-h="svelte-1xwtu1p"><strong>컨볼루션 네트워크</strong>를 활용하고 있기 때문에 (다른 유형들은 현재 지원되지 않음), 다음 설정을 통해 추론 전에 cuDNN autotuner를 활성화할 수 있습니다:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch

	torch.backends.cudnn.benchmark = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="fp32-대신-tf32-사용하기-ampere-및-이후-cuda-장치들에서" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fp32-대신-tf32-사용하기-ampere-및-이후-cuda-장치들에서"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서)</span></h3> <p data-svelte-h="svelte-1gm9l4i">Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
	기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
	네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
	이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
	그것에 대해 <a href="https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32" rel="nofollow">여기</a>서 더 읽을 수 있습니다.
	추론하기 전에 다음을 추가하기만 하면 됩니다:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch

	torch.backends.cuda.matmul.allow_tf32 = <span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="반정밀도-가중치" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#반정밀도-가중치"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>반정밀도 가중치</span></h2> <p data-svelte-h="svelte-1y9jw5r">더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다.
	여기에는 <code>fp16</code>이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 <code>float16</code> 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,

	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.</div> <h2 class="relative group"><a id="추가-메모리-절약을-위한-슬라이스-어텐션" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#추가-메모리-절약을-위한-슬라이스-어텐션"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>추가 메모리 절약을 위한 슬라이스 어텐션</span></h2> <p data-svelte-h="svelte-97drxa">추가 메모리 절약을 위해, 한 번에 모두 계산하는 대신 단계적으로 계산을 수행하는 슬라이스 버전의 어텐션(attention)을 사용할 수 있습니다.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Attention slicing은 모델이 하나 이상의 어텐션 헤드를 사용하는 한, 배치 크기가 1인 경우에도 유용합니다.
	하나 이상의 어텐션 헤드가 있는 경우 QK^T 어텐션 매트릭스는 상당한 양의 메모리를 절약할 수 있는 각 헤드에 대해 순차적으로 계산될 수 있습니다.</div> <p data-svelte-h="svelte-19j5lzh">각 헤드에 대해 순차적으로 어텐션 계산을 수행하려면, 다음과 같이 추론 전에 파이프라인에서 <code>enable_attention_slicing()</code>를 호출하면 됩니다:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,

	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_attention_slicing()
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1809mre">추론 시간이 약 10% 느려지는 약간의 성능 저하가 있지만 이 방법을 사용하면 3.2GB 정도의 작은 VRAM으로도 Stable Diffusion을 사용할 수 있습니다!</p> <h2 class="relative group"><a id="더-큰-배치를-위한-sliced-vae-디코드" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#더-큰-배치를-위한-sliced-vae-디코드"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>더 큰 배치를 위한 sliced VAE 디코드</span></h2> <p data-svelte-h="svelte-1klv9ve">제한된 VRAM에서 대규모 이미지 배치를 디코딩하거나 32개 이상의 이미지가 포함된 배치를 활성화하기 위해, 배치의 latent 이미지를 한 번에 하나씩 디코딩하는 슬라이스 VAE 디코드를 사용할 수 있습니다.</p> <p data-svelte-h="svelte-1bo4p0c">이를 <code>enable_attention_slicing()</code> 또는 <code>enable_xformers_memory_efficient_attention()</code>과 결합하여 메모리 사용을 추가로 최소화할 수 있습니다.</p> <p data-svelte-h="svelte-j8mqed">VAE 디코드를 한 번에 하나씩 수행하려면 추론 전에 파이프라인에서 <code>enable_vae_slicing()</code>을 호출합니다. 예를 들어:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,

	torch_dtype=torch.float16,
	)
	pipe = pipe.to(<span class="hljs-string">"cuda"</span>)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_vae_slicing()
	images = pipe([prompt] * <span class="hljs-number">32</span>).images<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1l99s96">다중 이미지 배치에서 VAE 디코드가 약간의 성능 향상이 이루어집니다. 단일 이미지 배치에서는 성능 영향은 없습니다.</p> <a name="sequential_offloading"></a>
	## 메모리 절약을 위해 가속 기능을 사용하여 CPU로 오프로딩
	<p data-svelte-h="svelte-131mxrq">추가 메모리 절약을 위해 가중치를 CPU로 오프로드하고 순방향 전달을 수행할 때만 GPU로 로드할 수 있습니다.</p> <p data-svelte-h="svelte-f0b7n3">CPU 오프로딩을 수행하려면 <code>enable_sequential_cpu_offload()</code>를 호출하기만 하면 됩니다:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,

	torch_dtype=torch.float16,
	)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_sequential_cpu_offload()
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-tablwz">그러면 메모리 소비를 3GB 미만으로 줄일 수 있습니다.</p> <p data-svelte-h="svelte-1tbver6">참고로 이 방법은 전체 모델이 아닌 서브모듈 수준에서 작동합니다. 이는 메모리 소비를 최소화하는 가장 좋은 방법이지만 프로세스의 반복적 특성으로 인해 추론 속도가 훨씬 느립니다. 파이프라인의 UNet 구성 요소는 여러 번 실행됩니다(‘num_inference_steps’ 만큼). 매번 UNet의 서로 다른 서브모듈이 순차적으로 온로드된 다음 필요에 따라 오프로드되므로 메모리 이동 횟수가 많습니다.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">또 다른 최적화 방법인 <a href="#model_offloading" data-svelte-h="svelte-zbpoyt">모델 오프로딩</a>을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다.</div> <p data-svelte-h="svelte-1hfwhk5">또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,

	torch_dtype=torch.float16,
	)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_sequential_cpu_offload()
	pipe.enable_attention_slicing(<span class="hljs-number">1</span>)

	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ax3hx7"><strong>참고</strong>: ‘enable_sequential_cpu_offload()‘를 사용할 때, 미리 파이프라인을 CUDA로 이동하지 <strong>않는</strong> 것이 중요합니다.그렇지 않으면 메모리 소비의 이득이 최소화됩니다. 더 많은 정보를 위해 <a href="https://github.com/huggingface/diffusers/issues/1934" rel="nofollow">이 이슈</a>를 보세요.</p> <a name="model_offloading"></a>
	## 빠른 추론과 메모리 메모리 절약을 위한 모델 오프로딩
	<p data-svelte-h="svelte-7dwxx7"><a href="#sequential_offloading">순차적 CPU 오프로딩</a>은 이전 섹션에서 설명한 것처럼 많은 메모리를 보존하지만 필요에 따라 서브모듈을 GPU로 이동하고 새 모듈이 실행될 때 즉시 CPU로 반환되기 때문에 추론 속도가 느려집니다.</p> <p data-svelte-h="svelte-1llz1y7">전체 모델 오프로딩은 각 모델의 구성 요소인 <em>modules</em>을 처리하는 대신, 전체 모델을 GPU로 이동하는 대안입니다. 이로 인해 추론 시간에 미치는 영향은 미미하지만(파이프라인을 ‘cuda’로 이동하는 것과 비교하여) 여전히 약간의 메모리를 절약할 수 있습니다.</p> <p data-svelte-h="svelte-1hmauk">이 시나리오에서는 파이프라인의 주요 구성 요소 중 하나만(일반적으로 텍스트 인코더, unet 및 vae) GPU에 있고, 나머지는 CPU에서 대기할 것입니다.
	여러 반복을 위해 실행되는 UNet과 같은 구성 요소는 더 이상 필요하지 않을 때까지 GPU에 남아 있습니다.</p> <p data-svelte-h="svelte-1gllwmo">이 기능은 아래와 같이 파이프라인에서 <code>enable_model_cpu_offload()</code>를 호출하여 활성화할 수 있습니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,
	torch_dtype=torch.float16,
	)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_model_cpu_offload()
	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-5q5hse">이는 추가적인 메모리 절약을 위한 attention slicing과도 호환됩니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,
	torch_dtype=torch.float16,
	)

	prompt = <span class="hljs-string">"a photo of an astronaut riding a horse on mars"</span>
	pipe.enable_model_cpu_offload()
	pipe.enable_attention_slicing(<span class="hljs-number">1</span>)

	image = pipe(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">이 기능을 사용하려면 'accelerate' 버전 0.17.0 이상이 필요합니다.</div> <h2 class="relative group"><a id="channels-last-메모리-형식-사용하기" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#channels-last-메모리-형식-사용하기"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Channels Last 메모리 형식 사용하기</span></h2> <p data-svelte-h="svelte-1rrl8zz">Channels Last 메모리 형식은 차원 순서를 보존하는 메모리에서 NCHW 텐서 배열을 대체하는 방법입니다.
	Channels Last 텐서는 채널이 가장 조밀한 차원이 되는 방식으로 정렬됩니다(일명 픽셀당 이미지를 저장).
	현재 모든 연산자 Channels Last 형식을 지원하는 것은 아니라 성능이 저하될 수 있으므로, 사용해보고 모델에 잘 작동하는지 확인하는 것이 좋습니다.</p> <p data-svelte-h="svelte-1c0oa55">예를 들어 파이프라인의 UNet 모델이 channels Last 형식을 사용하도록 설정하려면 다음을 사용할 수 있습니다:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride()) <span class="hljs-comment"># (2880, 9, 3, 1)</span>
	pipe.unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># in-place 연산</span>
	<span class="hljs-comment"># 2번째 차원에서 스트라이드 1을 가지는 (2880, 1, 960, 320)로, 연산이 작동함을 증명합니다.</span>
	<span class="hljs-built_in">print</span>(pipe.unet.conv_out.state_dict()[<span class="hljs-string">"weight"</span>].stride())<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="추적tracing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#추적tracing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>추적(tracing)</span></h2> <p data-svelte-h="svelte-1jkjfr1">추적은 모델을 통해 예제 입력 텐서를 통해 실행되는데, 해당 입력이 모델의 레이어를 통과할 때 호출되는 작업을 캡처하여 실행 파일 또는 ‘ScriptFunction’이 반환되도록 하고, 이는 just-in-time 컴파일로 최적화됩니다.</p> <p data-svelte-h="svelte-mqbplb">UNet 모델을 추적하기 위해 다음을 사용할 수 있습니다:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
	<span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
	<span class="hljs-keyword">import</span> functools

	<span class="hljs-comment"># torch 기울기 비활성화</span>
	torch.set_grad_enabled(<span class="hljs-literal">False</span>)

	<span class="hljs-comment"># 변수 설정</span>
	n_experiments = <span class="hljs-number">2</span>
	unet_runs_per_experiment = <span class="hljs-number">50</span>


	<span class="hljs-comment"># 입력 불러오기</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">generate_inputs</span>():
	sample = torch.randn((<span class="hljs-number">2</span>, <span class="hljs-number">4</span>, <span class="hljs-number">64</span>, <span class="hljs-number">64</span>), device=<span class="hljs-string">"cuda"</span>, dtype=torch.float16)
	timestep = torch.rand(<span class="hljs-number">1</span>, device=<span class="hljs-string">"cuda"</span>, dtype=torch.float16) * <span class="hljs-number">999</span>
	encoder_hidden_states = torch.randn((<span class="hljs-number">2</span>, <span class="hljs-number">77</span>, <span class="hljs-number">768</span>), device=<span class="hljs-string">"cuda"</span>, dtype=torch.float16)
	<span class="hljs-keyword">return</span> sample, timestep, encoder_hidden_states


	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)
	unet = pipe.unet
	unet.<span class="hljs-built_in">eval</span>()
	unet.to(memory_format=torch.channels_last) <span class="hljs-comment"># Channels Last 메모리 형식 사용</span>
	unet.forward = functools.partial(unet.forward, return_dict=<span class="hljs-literal">False</span>) <span class="hljs-comment"># return_dict=False을 기본값으로 설정</span>

	<span class="hljs-comment"># 워밍업</span>
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>):
	<span class="hljs-keyword">with</span> torch.inference_mode():
	inputs = generate_inputs()
	orig_output = unet(*inputs)

	<span class="hljs-comment"># 추적</span>
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"tracing.."</span>)
	unet_traced = torch.jit.trace(unet, inputs)
	unet_traced.<span class="hljs-built_in">eval</span>()
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"done tracing"</span>)


	<span class="hljs-comment"># 워밍업 및 그래프 최적화</span>
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">5</span>):
	<span class="hljs-keyword">with</span> torch.inference_mode():
	inputs = generate_inputs()
	orig_output = unet_traced(*inputs)


	<span class="hljs-comment"># 벤치마킹</span>
	<span class="hljs-keyword">with</span> torch.inference_mode():
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments):
	torch.cuda.synchronize()
	start_time = time.time()
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment):
	orig_output = unet_traced(*inputs)
	torch.cuda.synchronize()
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet traced inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>)
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_experiments):
	torch.cuda.synchronize()
	start_time = time.time()
	<span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(unet_runs_per_experiment):
	orig_output = unet(*inputs)
	torch.cuda.synchronize()
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"unet inference took <span class="hljs-subst">{time.time() - start_time:<span class="hljs-number">.2</span>f}</span> seconds"</span>)

	<span class="hljs-comment"># 모델 저장</span>
	unet_traced.save(<span class="hljs-string">"unet_traced.pt"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ewkmr2">그 다음, 파이프라인의 <code>unet</code> 특성을 다음과 같이 추적된 모델로 바꿀 수 있습니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
	<span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> dataclasses <span class="hljs-keyword">import</span> dataclass


	<span class="hljs-meta">@dataclass</span>
	<span class="hljs-keyword">class</span> <span class="hljs-title class_">UNet2DConditionOutput</span>:
	sample: torch.Tensor


	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)

	<span class="hljs-comment"># jitted unet 사용</span>
	unet_traced = torch.jit.load(<span class="hljs-string">"unet_traced.pt"</span>)


	<span class="hljs-comment"># pipe.unet 삭제</span>
	<span class="hljs-keyword">class</span> <span class="hljs-title class_">TracedUNet</span>(torch.nn.Module):
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self</span>):
	<span class="hljs-built_in">super</span>().__init__()
	self.in_channels = pipe.unet.config.in_channels
	self.device = pipe.unet.device

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">self, latent_model_input, t, encoder_hidden_states</span>):
	sample = unet_traced(latent_model_input, t, encoder_hidden_states)[<span class="hljs-number">0</span>]
	<span class="hljs-keyword">return</span> UNet2DConditionOutput(sample=sample)


	pipe.unet = TracedUNet()

	<span class="hljs-keyword">with</span> torch.inference_mode():
	image = pipe([prompt] * <span class="hljs-number">1</span>, num_inference_steps=<span class="hljs-number">50</span>).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="memory-efficient-attention" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-efficient-attention"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory-efficient attention</span></h2> <p data-svelte-h="svelte-1lkmvkn">어텐션 블록의 대역폭을 최적화하는 최근 작업으로 GPU 메모리 사용량이 크게 향상되고 향상되었습니다.
	@tridao의 가장 최근의 플래시 어텐션: <a href="https://github.com/HazyResearch/flash-attention" rel="nofollow">code</a>, <a href="https://arxiv.org/pdf/2205.14135.pdf" rel="nofollow">paper</a>.</p> <p data-svelte-h="svelte-1aa24j0">배치 크기 1(프롬프트 1개)의 512x512 크기로 추론을 실행할 때 몇 가지 Nvidia GPU에서 얻은 속도 향상은 다음과 같습니다:</p> <table data-svelte-h="svelte-13acbqe"><thead><tr><th>GPU</th> <th>기준 어텐션 FP16</th> <th>메모리 효율적인 어텐션 FP16</th></tr></thead> <tbody><tr><td>NVIDIA Tesla T4</td> <td>3.5it/s</td> <td>5.5it/s</td></tr> <tr><td>NVIDIA 3060 RTX</td> <td>4.6it/s</td> <td>7.8it/s</td></tr> <tr><td>NVIDIA A10G</td> <td>8.88it/s</td> <td>15.6it/s</td></tr> <tr><td>NVIDIA RTX A6000</td> <td>11.7it/s</td> <td>21.09it/s</td></tr> <tr><td>NVIDIA TITAN RTX</td> <td>12.51it/s</td> <td>18.22it/s</td></tr> <tr><td>A100-SXM4-40GB</td> <td>18.6it/s</td> <td>29.it/s</td></tr> <tr><td>A100-SXM-80GB</td> <td>18.7it/s</td> <td>29.5it/s</td></tr></tbody></table> <p data-svelte-h="svelte-j9132p">이를 활용하려면 다음을 만족해야 합니다:</p> <ul data-svelte-h="svelte-gqxwyg"><li>PyTorch > 1.12</li> <li>Cuda 사용 가능</li> <li><a href="xformers">xformers 라이브러리를 설치함</a></li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline
	<span class="hljs-keyword">import</span> torch

	pipe = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)

	pipe.enable_xformers_memory_efficient_attention()

	<span class="hljs-keyword">with</span> torch.inference_mode():
	sample = pipe(<span class="hljs-string">"a small cat"</span>)

	<span class="hljs-comment"># 선택: 이를 비활성화 하기 위해 다음을 사용할 수 있습니다.</span>
	<span class="hljs-comment"># pipe.disable_xformers_memory_efficient_attention()</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/ko/optimization/fp16.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_zj8ija = {
	assets: "/docs/diffusers/pr_11234/ko",
	base: "/docs/diffusers/pr_11234/ko",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/diffusers/pr_11234/ko/_app/immutable/entry/start.17c9338f.js"),
	import("/docs/diffusers/pr_11234/ko/_app/immutable/entry/app.0ba20c84.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 11],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 56.8 kB
Xet hash:: c3bc3ea1cf434a91ecb56a04c4694c0cffd92e8da4e465f0421487ce312adb23

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.