Buckets:
| import{s as Be,o as Ye,n as Le}from"../chunks/scheduler.852ec091.js";import{S as He,i as Fe,g as p,s as a,r as d,A as Ee,h as r,f as s,c as i,j as Ie,u as b,x as o,k as fe,y as Qe,a as l,v as c,d as M,t as u,w as y}from"../chunks/index.28275fd3.js";import{T as Ae}from"../chunks/Tip.9f398c59.js";import{C as S}from"../chunks/CodeBlock.c3366071.js";import{H as $e,E as Se}from"../chunks/EditOnGithub.582011f0.js";function Xe(X){let n,h="8-bit optimizers reduce memory usage and accelerate optimization on a wide range of tasks. However, since 8-bit optimizers only reduce memory proportional to the number of parameters, models that use large amounts of activation memory, such as convolutional networks, don’t really benefit from 8-bit optimizers. 8-bit optimizers are most beneficial for training or finetuning models with many parameters on highly memory-constrained GPUs.";return{c(){n=p("p"),n.textContent=h},l(m){n=r(m,"P",{"data-svelte-h":!0}),o(n)!=="svelte-q5gilt"&&(n.textContent=h)},m(m,w){l(m,n,w)},p:Le,d(m){m&&s(n)}}}function Re(X){let n,h="Check the optimizer API documentation for more information about other hyperparameters you can override.";return{c(){n=p("p"),n.textContent=h},l(m){n=r(m,"P",{"data-svelte-h":!0}),o(n)!=="svelte-g9htrg"&&(n.textContent=h)},m(m,w){l(m,n,w)},p:Le,d(m){m&&s(n)}}}function Ve(X){let n,h,m,w,J,V,T,je="With 8-bit optimizers, large models can be finetuned with 75% less GPU memory without losing any accuracy compared to training with standard 32-bit optimizers. The reduced memory requirements means 8-bit optimizers are 4x faster than a standard optimizer, and no hyperparameter tuning is required.",N,g,Je="This guide will show you how to use 8-bit optimizers.",P,f,K,U,Te='8-bit optimizers are a drop-in replacement for regular optimizers which means they also accept the same arguments as a regular optimizer. For NLP models, it is recommended to use the <a href="/docs/bitsandbytes/pr_1501/en/reference/nn/embeddings#bitsandbytes.nn.StableEmbedding">StableEmbedding</a> class to improve stability and results.',O,G,D,_,ge="By default, all parameter tensors with less than 4096 elements are kept at 32-bits even if you initialize those parameters with 8-bit optimizers. This is done because small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm).",q,v,Ue="You can change this value with the <code>min_8bit_size</code> parameter. For example, if you want to optimize parameters to 8-bits only if the minimum size is 16384 values (it is recommended to use multiples of 4096):",ee,x,te,Z,Ge='Other parameters you can configure include the learning rate (<code>lr</code>), the decay rates (<code>betas</code>), the number of bits of the optimizer state (<code>optim_bits</code>), and percentile clipping (<code>percentile_clipping</code>) which can increase stability. For example, to initialize a 32-bit <a href="/docs/bitsandbytes/pr_1501/en/reference/optim/adam#bitsandbytes.optim.Adam">Adam</a> optimizer with 5th percentile clipping:',se,z,le,W,ae,k,_e='To optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, use the <a href="/docs/bitsandbytes/pr_1501/en/reference/optim/optim_overview#bitsandbytes.optim.GlobalOptimManager">GlobalOptimManager</a> class to override the specific hyperparameters for a particular layer. You’ll need to:',ie,C,ve="<li>Register the parameters while they’re on the CPU.</li>",ne,I,me,$,xe="<li>Override the config with the new desired hyperparameters. For example, let’s override the <code>model.fc1.weight</code> layer to use 32-bit Adam.</li>",pe,j,re,A,oe,L,Ze="You can also override multiple layers at once by passing them as a list and the new hyperparameters as a dictionary. For example, let’s override the <code>model.special.weight</code> and <code>model.also_special.weight</code> layers to use sparse optimization and a lower learning and decay rate.",de,B,be,Y,ze='For a specific layer, we recommend overriding locally in each module. Pass the module, the parameter, and its attribute name to the <a href="/docs/bitsandbytes/pr_1501/en/reference/optim/optim_overview#bitsandbytes.optim.GlobalOptimManager">GlobalOptimManager</a>:',ce,H,Me,F,ue,E,We='For more conceptual details and explanation about 8-bit optimizers, take a look at the <a href="./explanations/optimizers">8-bit optimizers</a> guide.',ye,Q,he,R,we;return J=new $e({props:{title:"8-bit optimizers",local:"8-bit-optimizers",headingTag:"h1"}}),f=new Ae({props:{warning:!0,$$slots:{default:[Xe]},$$scope:{ctx:X}}}),G=new S({props:{code:"aW1wb3J0JTIwYml0c2FuZGJ5dGVzJTIwYXMlMjBibmIlMEElMEEtJTIwYWRhbSUyMCUzRCUyMHRvcmNoLm9wdGltLkFkYW0oLi4uKSUwQSUyQiUyMGFkYW0lMjAlM0QlMjBibmIub3B0aW0uQWRhbThiaXQoLi4uKSUwQSUwQSUyMyUyMHJlY29tbWVuZGVkJTIwZm9yJTIwTkxQJTIwbW9kZWxzJTBBLSUyMGJlZm9yZSUzQSUyMHRvcmNoLm5uLkVtYmVkZGluZyguLi4pJTBBJTJCJTIwYm5iLm5uLlN0YWJsZUVtYmVkZGluZyguLi4p",highlighted:`import bitsandbytes as bnb | |
| <span class="hljs-deletion">- adam = torch.optim.Adam(...)</span> | |
| <span class="hljs-addition">+ adam = bnb.optim.Adam8bit(...)</span> | |
| # recommended for NLP models | |
| <span class="hljs-deletion">- before: torch.nn.Embedding(...)</span> | |
| <span class="hljs-addition">+ bnb.nn.StableEmbedding(...)</span>`,wrap:!1}}),x=new S({props:{code:"aW1wb3J0JTIwYml0c2FuZGJ5dGVzJTIwYXMlMjBibmIlMEElMEFhZGFtJTIwJTNEJTIwYm5iLm9wdGltLkFkYW04Yml0KG1vZGVsLnBhcmFtZXRlcnMoKSUyQyUyMG1pbl84Yml0X3NpemUlM0QxNjM4NCk=",highlighted:`<span class="hljs-keyword">import</span> bitsandbytes <span class="hljs-keyword">as</span> bnb | |
| adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=<span class="hljs-number">16384</span>)`,wrap:!1}}),z=new S({props:{code:"aW1wb3J0JTIwYml0c2FuZGJ5dGVzJTIwYXMlMjBibmIlMEElMEFhZGFtJTIwJTNEJTIwYm5iLm9wdGltLkFkYW0obW9kZWwucGFyYW1ldGVycygpJTJDJTIwbHIlM0QwLjAwMSUyQyUyMGJldGFzJTNEKDAuOSUyQyUyMDAuOTk1KSUyQyUyMG9wdGltX2JpdHMlM0QzMiUyQyUyMHBlcmNlbnRpbGVfY2xpcHBpbmclM0Q1KQ==",highlighted:`<span class="hljs-keyword">import</span> bitsandbytes <span class="hljs-keyword">as</span> bnb | |
| adam = bnb.optim.Adam(model.parameters(), lr=<span class="hljs-number">0.001</span>, betas=(<span class="hljs-number">0.9</span>, <span class="hljs-number">0.995</span>), optim_bits=<span class="hljs-number">32</span>, percentile_clipping=<span class="hljs-number">5</span>)`,wrap:!1}}),W=new $e({props:{title:"Optimize unstable parameters",local:"optimize-unstable-parameters",headingTag:"h2"}}),I=new S({props:{code:"aW1wb3J0JTIwdG9yY2glMEFpbXBvcnQlMjBiaXRzYW5kYnl0ZXMlMjBhcyUyMGJuYiUwQSUwQW1uZyUyMCUzRCUyMGJuYi5vcHRpbS5HbG9iYWxPcHRpbU1hbmFnZXIuZ2V0X2luc3RhbmNlKCklMEElMEFtb2RlbCUyMCUzRCUyME15TW9kZWwoKSUwQW1uZy5yZWdpc3Rlcl9wYXJhbWV0ZXJzKG1vZGVsLnBhcmFtZXRlcnMoKSk=",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">import</span> bitsandbytes <span class="hljs-keyword">as</span> bnb | |
| mng = bnb.optim.GlobalOptimManager.get_instance() | |
| model = MyModel() | |
| mng.register_parameters(model.parameters())`,wrap:!1}}),j=new Ae({props:{warning:!1,$$slots:{default:[Re]},$$scope:{ctx:X}}}),A=new S({props:{code:"bW9kZWwlMjAlM0QlMjBtb2RlbC5jdWRhKCklMEElMjMlMjB1c2UlMjA4LWJpdCUyMG9wdGltaXplciUyMHN0YXRlcyUyMGZvciUyMGFsbCUyMHBhcmFtZXRlcnMlMEFhZGFtJTIwJTNEJTIwYm5iLm9wdGltLkFkYW0obW9kZWwucGFyYW1ldGVycygpJTJDJTIwbHIlM0QwLjAwMSUyQyUyMG9wdGltX2JpdHMlM0Q4KSUwQSUwQSUyMyUyMG92ZXJyaWRlJTIwdGhlJTIwcGFyYW1ldGVyJTIwbW9kZWwuZmMxLndlaWdodCUyMG5vdyUyMHVzZXMlMjAzMi1iaXQlMjBBZGFtJTBBbW5nLm92ZXJyaWRlX2NvbmZpZyhtb2RlbC5mYzEud2VpZ2h0JTJDJTIwJTIyb3B0aW1fYml0cyUyMiUyQyUyMDMyKQ==",highlighted:`model = model.cuda() | |
| <span class="hljs-comment"># use 8-bit optimizer states for all parameters</span> | |
| adam = bnb.optim.Adam(model.parameters(), lr=<span class="hljs-number">0.001</span>, optim_bits=<span class="hljs-number">8</span>) | |
| <span class="hljs-comment"># override the parameter model.fc1.weight now uses 32-bit Adam</span> | |
| mng.override_config(model.fc1.weight, <span class="hljs-string">"optim_bits"</span>, <span class="hljs-number">32</span>)`,wrap:!1}}),B=new S({props:{code:"bW5nLm92ZXJyaWRlX2NvbmZpZyglNUJtb2RlbC5zcGVjaWFsLndlaWdodCUyQyUyMG1vZGVsLmFsc29fc3BlY2lhbC53ZWlnaHQlNUQlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBrZXlfdmFsdWVfZGljdCUyMCUzRCU3Qidpc19zcGFyc2UnJTNBJTIwVHJ1ZSUyQyUyMCdsciclM0ElMjAxZS01JTJDJTIwJ2JldGFzJyUzRCgwLjklMkMlMjAwLjk4KSU3RCk=",highlighted:`mng.override_config([model.special.weight, model.also_special.weight], | |
| key_value_dict ={<span class="hljs-string">'is_sparse'</span>: <span class="hljs-literal">True</span>, <span class="hljs-string">'lr'</span>: <span class="hljs-number">1e-5</span>, <span class="hljs-string">'betas'</span>=(<span class="hljs-number">0.9</span>, <span class="hljs-number">0.98</span>)})`,wrap:!1}}),H=new S({props:{code:"Y2xhc3MlMjBNeU1vZHVsZSh0b3JjaC5ubi5Nb2R1bGUpJTNBJTBBJTIwJTIwZGVmJTIwX19pbml0X18oZF9pbiUyQyUyMGRfb3V0KSUzQSUwQSUyMCUyMCUyMCUyMHN1cGVyKE15TW9kdWxlJTJDJTIwc2VsZikuX19pbml0X18oKSUwQSUyMCUyMCUyMCUyMHNlbGYubGluZWFyJTIwJTNEJTIwdG9yY2gubm4uTGluZWFyKGRfaW4lMkMlMjBkX291dCklMEElMjAlMjAlMjAlMjAlMjMlMjBvcHRpbWl6YXRpb24lMjB3aWxsJTIwaGFwcGVuJTIwaW4lMjAzMi1iaXQlMjBhbmQlMEElMjAlMjAlMjAlMjAlMjMlMjBsZWFybmluZyUyMHJhdGUlMjB3aWxsJTIwYmUlMjBzZXQlMjB0byUyMDAuMDAwMSUyMGluZGVwZW5kZW50JTIwb2YlMjB0aGUlMjBtYWluJTIwbGVhcm5pbmclMjByYXRlJTBBJTIwJTIwJTIwJTIwY29uZmlnJTIwJTNEJTIwJTdCJ29wdGltX2JpdHMnJTNBJTIwMzIlMkMlMjAnbHInJTIwJTNBJTIwMC4wMDAxJTdEJTBBJTIwJTIwJTIwJTIwR2xvYmFsT3B0aW1NYW5hZ2VyLmdldF9pbnN0YW5jZSgpLnJlZ2lzdGVyX21vZHVsZV9vdmVycmlkZShzZWxmJTJDJTIwJ3dlaWdodCclMkMlMjBjb25maWcpJTBB",highlighted:`<span class="hljs-keyword">class</span> <span class="hljs-title class_">MyModule</span>(torch.nn.Module): | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">d_in, d_out</span>): | |
| <span class="hljs-built_in">super</span>(MyModule, self).__init__() | |
| self.linear = torch.nn.Linear(d_in, d_out) | |
| <span class="hljs-comment"># optimization will happen in 32-bit and</span> | |
| <span class="hljs-comment"># learning rate will be set to 0.0001 independent of the main learning rate</span> | |
| config = {<span class="hljs-string">'optim_bits'</span>: <span class="hljs-number">32</span>, <span class="hljs-string">'lr'</span> : <span class="hljs-number">0.0001</span>} | |
| GlobalOptimManager.get_instance().register_module_override(self, <span class="hljs-string">'weight'</span>, config) | |
| `,wrap:!1}}),F=new $e({props:{title:"Next steps",local:"next-steps",headingTag:"h2"}}),Q=new Se({props:{source:"https://github.com/bitsandbytes-foundation/bitsandbytes/blob/main/docs/source/optimizers.mdx"}}),{c(){n=p("meta"),h=a(),m=p("p"),w=a(),d(J.$$.fragment),V=a(),T=p("p"),T.textContent=je,N=a(),g=p("p"),g.textContent=Je,P=a(),d(f.$$.fragment),K=a(),U=p("p"),U.innerHTML=Te,O=a(),d(G.$$.fragment),D=a(),_=p("p"),_.textContent=ge,q=a(),v=p("p"),v.innerHTML=Ue,ee=a(),d(x.$$.fragment),te=a(),Z=p("p"),Z.innerHTML=Ge,se=a(),d(z.$$.fragment),le=a(),d(W.$$.fragment),ae=a(),k=p("p"),k.innerHTML=_e,ie=a(),C=p("ol"),C.innerHTML=ve,ne=a(),d(I.$$.fragment),me=a(),$=p("ol"),$.innerHTML=xe,pe=a(),d(j.$$.fragment),re=a(),d(A.$$.fragment),oe=a(),L=p("p"),L.innerHTML=Ze,de=a(),d(B.$$.fragment),be=a(),Y=p("p"),Y.innerHTML=ze,ce=a(),d(H.$$.fragment),Me=a(),d(F.$$.fragment),ue=a(),E=p("p"),E.innerHTML=We,ye=a(),d(Q.$$.fragment),he=a(),R=p("p"),this.h()},l(e){const t=Ee("svelte-u9bgzb",document.head);n=r(t,"META",{name:!0,content:!0}),t.forEach(s),h=i(e),m=r(e,"P",{}),Ie(m).forEach(s),w=i(e),b(J.$$.fragment,e),V=i(e),T=r(e,"P",{"data-svelte-h":!0}),o(T)!=="svelte-7xifi0"&&(T.textContent=je),N=i(e),g=r(e,"P",{"data-svelte-h":!0}),o(g)!=="svelte-anq6iu"&&(g.textContent=Je),P=i(e),b(f.$$.fragment,e),K=i(e),U=r(e,"P",{"data-svelte-h":!0}),o(U)!=="svelte-ebsagl"&&(U.innerHTML=Te),O=i(e),b(G.$$.fragment,e),D=i(e),_=r(e,"P",{"data-svelte-h":!0}),o(_)!=="svelte-1swxts3"&&(_.textContent=ge),q=i(e),v=r(e,"P",{"data-svelte-h":!0}),o(v)!=="svelte-swi3zx"&&(v.innerHTML=Ue),ee=i(e),b(x.$$.fragment,e),te=i(e),Z=r(e,"P",{"data-svelte-h":!0}),o(Z)!=="svelte-1e3vnwa"&&(Z.innerHTML=Ge),se=i(e),b(z.$$.fragment,e),le=i(e),b(W.$$.fragment,e),ae=i(e),k=r(e,"P",{"data-svelte-h":!0}),o(k)!=="svelte-1khwo81"&&(k.innerHTML=_e),ie=i(e),C=r(e,"OL",{"data-svelte-h":!0}),o(C)!=="svelte-1revex8"&&(C.innerHTML=ve),ne=i(e),b(I.$$.fragment,e),me=i(e),$=r(e,"OL",{start:!0,"data-svelte-h":!0}),o($)!=="svelte-127zgse"&&($.innerHTML=xe),pe=i(e),b(j.$$.fragment,e),re=i(e),b(A.$$.fragment,e),oe=i(e),L=r(e,"P",{"data-svelte-h":!0}),o(L)!=="svelte-kxfpon"&&(L.innerHTML=Ze),de=i(e),b(B.$$.fragment,e),be=i(e),Y=r(e,"P",{"data-svelte-h":!0}),o(Y)!=="svelte-jxx0wx"&&(Y.innerHTML=ze),ce=i(e),b(H.$$.fragment,e),Me=i(e),b(F.$$.fragment,e),ue=i(e),E=r(e,"P",{"data-svelte-h":!0}),o(E)!=="svelte-1er8qov"&&(E.innerHTML=We),ye=i(e),b(Q.$$.fragment,e),he=i(e),R=r(e,"P",{}),Ie(R).forEach(s),this.h()},h(){fe(n,"name","hf:doc:metadata"),fe(n,"content",Ne),fe($,"start","2")},m(e,t){Qe(document.head,n),l(e,h,t),l(e,m,t),l(e,w,t),c(J,e,t),l(e,V,t),l(e,T,t),l(e,N,t),l(e,g,t),l(e,P,t),c(f,e,t),l(e,K,t),l(e,U,t),l(e,O,t),c(G,e,t),l(e,D,t),l(e,_,t),l(e,q,t),l(e,v,t),l(e,ee,t),c(x,e,t),l(e,te,t),l(e,Z,t),l(e,se,t),c(z,e,t),l(e,le,t),c(W,e,t),l(e,ae,t),l(e,k,t),l(e,ie,t),l(e,C,t),l(e,ne,t),c(I,e,t),l(e,me,t),l(e,$,t),l(e,pe,t),c(j,e,t),l(e,re,t),c(A,e,t),l(e,oe,t),l(e,L,t),l(e,de,t),c(B,e,t),l(e,be,t),l(e,Y,t),l(e,ce,t),c(H,e,t),l(e,Me,t),c(F,e,t),l(e,ue,t),l(e,E,t),l(e,ye,t),c(Q,e,t),l(e,he,t),l(e,R,t),we=!0},p(e,[t]){const ke={};t&2&&(ke.$$scope={dirty:t,ctx:e}),f.$set(ke);const Ce={};t&2&&(Ce.$$scope={dirty:t,ctx:e}),j.$set(Ce)},i(e){we||(M(J.$$.fragment,e),M(f.$$.fragment,e),M(G.$$.fragment,e),M(x.$$.fragment,e),M(z.$$.fragment,e),M(W.$$.fragment,e),M(I.$$.fragment,e),M(j.$$.fragment,e),M(A.$$.fragment,e),M(B.$$.fragment,e),M(H.$$.fragment,e),M(F.$$.fragment,e),M(Q.$$.fragment,e),we=!0)},o(e){u(J.$$.fragment,e),u(f.$$.fragment,e),u(G.$$.fragment,e),u(x.$$.fragment,e),u(z.$$.fragment,e),u(W.$$.fragment,e),u(I.$$.fragment,e),u(j.$$.fragment,e),u(A.$$.fragment,e),u(B.$$.fragment,e),u(H.$$.fragment,e),u(F.$$.fragment,e),u(Q.$$.fragment,e),we=!1},d(e){e&&(s(h),s(m),s(w),s(V),s(T),s(N),s(g),s(P),s(K),s(U),s(O),s(D),s(_),s(q),s(v),s(ee),s(te),s(Z),s(se),s(le),s(ae),s(k),s(ie),s(C),s(ne),s(me),s($),s(pe),s(re),s(oe),s(L),s(de),s(be),s(Y),s(ce),s(Me),s(ue),s(E),s(ye),s(he),s(R)),s(n),y(J,e),y(f,e),y(G,e),y(x,e),y(z,e),y(W,e),y(I,e),y(j,e),y(A,e),y(B,e),y(H,e),y(F,e),y(Q,e)}}}const Ne='{"title":"8-bit optimizers","local":"8-bit-optimizers","sections":[{"title":"Optimize unstable parameters","local":"optimize-unstable-parameters","sections":[],"depth":2},{"title":"Next steps","local":"next-steps","sections":[],"depth":2}],"depth":1}';function Pe(X){return Ye(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class tt extends He{constructor(n){super(),Fe(this,n,Pe,Ve,Be,{})}}export{tt as component}; | |
Xet Storage Details
- Size:
- 15.1 kB
- Xet hash:
- 09000ad1d46c766f2088113d39fd420de464176f2fd1f30efa961d702cedb026
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.