# LAMB

[LAMB (Layerwise adaptive large batch optimization)](https://hf.co/papers/1904.00962) is an adaptive optimizer designed to accelerate training with large batch sizes, combining ideas from `LARS` and `Adam` to automatically scale the learning rate for each layer. It:

- calculates a *trust ratio* between the weight and gradient norms in a layer and clips the ratio to prevent overly large or small updates (sketched below)
- updates weights with the first and second moments
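The per-layer trust-ratio step can be sketched as follows. This is a minimal illustration of the idea, not the bitsandbytes kernel; `adam_update` and `max_ratio` are hypothetical names standing in for an Adam-style update direction already computed for the layer and a clipping bound:

```py
import torch

def lamb_layer_step(param: torch.Tensor, adam_update: torch.Tensor,
                    lr: float = 1e-3, max_ratio: float = 10.0) -> None:
    """Illustrative LAMB trust-ratio step for a single layer.

    `adam_update` is assumed to be the Adam direction for this layer
    (bias-corrected m / (sqrt(v) + eps), plus any weight decay term).
    """
    w_norm = param.norm()
    u_norm = adam_update.norm()
    if w_norm > 0 and u_norm > 0:
        # Trust ratio: scale the step by ||w|| / ||update||, clipped so a
        # single layer cannot take an overly large or small step.
        trust_ratio = (w_norm / u_norm).clamp(max=max_ratio).item()
    else:
        trust_ratio = 1.0
    param.add_(adam_update, alpha=-lr * trust_ratio)
```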
## LAMB

[\[source\]](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1532/bitsandbytes/optim/lamb.py#L8)

```py
class bitsandbytes.optim.LAMB(
    params,
    lr=0.001,
    bias_correction=True,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
    adam_w_mode=True,
    optim_bits=32,
    args=None,
    min_8bit_size=4096,
    percentile_clipping=100,
    block_wise=False,
    max_unorm=1.0,
)
```

Base LAMB optimizer.

**Parameters:**

- **params** (`torch.tensor`): The input parameters to optimize.
- **lr** (`float`, defaults to `1e-3`): The learning rate.
- **bias_correction** (`bool`, defaults to `True`): Whether to apply bias correction to the first- and second-order moments.
- **betas** (`tuple(float, float)`, defaults to `(0.9, 0.999)`): The decay rates of the first- and second-order moments of the optimizer.
- **eps** (`float`, defaults to `1e-8`): The epsilon value that prevents division by zero in the optimizer.
- **weight_decay** (`float`, defaults to `0`): The weight decay value for the optimizer.
- **amsgrad** (`bool`, defaults to `False`): Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the maximum of past squared gradients instead.
- **adam_w_mode** (`bool`, defaults to `True`): Whether to use the AdamW variant.
- **optim_bits** (`int`, defaults to `32`): The number of bits of the optimizer state.
- **args** (`object`, defaults to `None`): An object with additional arguments.
- **min_8bit_size** (`int`, defaults to `4096`): The minimum number of elements a parameter tensor must have to use 8-bit optimization.
- **percentile_clipping** (`int`, defaults to `100`): Automatically adapts the clipping threshold by tracking the last 100 gradient norms and clipping the gradient at a given percentile, to improve stability.
- **block_wise** (`bool`, defaults to `False`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
- **max_unorm** (`float`, defaults to `1.0`): The maximum gradient norm.
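A minimal usage sketch (the model and loss here are placeholders for illustration, not part of the original reference page):

```py
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = bnb.optim.LAMB(model.parameters(), lr=1e-3, betas=(0.9, 0.999))

x = torch.randn(16, 1024, device="cuda")
loss = model(x).pow(2).mean()  # dummy loss for illustration
loss.backward()
optimizer.step()
optimizer.zero_grad()
```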
## LAMB8bit

[\[source\]](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1532/bitsandbytes/optim/lamb.py#L75)

```py
class bitsandbytes.optim.LAMB8bit(
    params,
    lr=0.001,
    bias_correction=True,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
    adam_w_mode=True,
    args=None,
    min_8bit_size=4096,
    percentile_clipping=100,
    block_wise=False,
    max_unorm=1.0,
)
```

8-bit LAMB optimizer.

**Parameters:**

- **params** (`torch.tensor`): The input parameters to optimize.
- **lr** (`float`, defaults to `1e-3`): The learning rate.
- **bias_correction** (`bool`, defaults to `True`): Whether to apply bias correction to the first- and second-order moments.
- **betas** (`tuple(float, float)`, defaults to `(0.9, 0.999)`): The decay rates of the first- and second-order moments of the optimizer.
- **eps** (`float`, defaults to `1e-8`): The epsilon value that prevents division by zero in the optimizer.
- **weight_decay** (`float`, defaults to `0`): The weight decay value for the optimizer.
- **amsgrad** (`bool`, defaults to `False`): Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the maximum of past squared gradients instead.
- **adam_w_mode** (`bool`, defaults to `True`): Whether to use the AdamW variant.
- **args** (`object`, defaults to `None`): An object with additional arguments.
- **min_8bit_size** (`int`, defaults to `4096`): The minimum number of elements a parameter tensor must have to use 8-bit optimization.
- **percentile_clipping** (`int`, defaults to `100`): Automatically adapts the clipping threshold by tracking the last 100 gradient norms and clipping the gradient at a given percentile, to improve stability.
- **block_wise** (`bool`, defaults to `False`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
- **max_unorm** (`float`, defaults to `1.0`): The maximum gradient norm.
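To hold optimizer state in 8 bits, swap in `LAMB8bit`. A sketch with a placeholder model; note that per the `min_8bit_size` parameter above, small tensors keep higher-precision state:

```py
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096).cuda()
# Optimizer state is quantized to 8 bits for tensors with at least
# `min_8bit_size` elements; smaller tensors are left unquantized.
optimizer = bnb.optim.LAMB8bit(model.parameters(), lr=1e-3, min_8bit_size=4096)
```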
## LAMB32bit

[\[source\]](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1532/bitsandbytes/optim/lamb.py#L139)

```py
class bitsandbytes.optim.LAMB32bit(
    params,
    lr=0.001,
    bias_correction=True,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
    amsgrad=False,
    adam_w_mode=True,
    args=None,
    min_8bit_size=4096,
    percentile_clipping=100,
    block_wise=False,
    max_unorm=1.0,
)
```

32-bit LAMB optimizer.

**Parameters:**

- **params** (`torch.tensor`): The input parameters to optimize.
- **lr** (`float`, defaults to `1e-3`): The learning rate.
- **bias_correction** (`bool`, defaults to `True`): Whether to apply bias correction to the first- and second-order moments.
- **betas** (`tuple(float, float)`, defaults to `(0.9, 0.999)`): The decay rates of the first- and second-order moments of the optimizer.
- **eps** (`float`, defaults to `1e-8`): The epsilon value that prevents division by zero in the optimizer.
- **weight_decay** (`float`, defaults to `0`): The weight decay value for the optimizer.
- **amsgrad** (`bool`, defaults to `False`): Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the maximum of past squared gradients instead.
- **adam_w_mode** (`bool`, defaults to `True`): Whether to use the AdamW variant.
- **args** (`object`, defaults to `None`): An object with additional arguments.
- **min_8bit_size** (`int`, defaults to `4096`): The minimum number of elements a parameter tensor must have to use 8-bit optimization.
- **percentile_clipping** (`int`, defaults to `100`): Automatically adapts the clipping threshold by tracking the last 100 gradient norms and clipping the gradient at a given percentile, to improve stability.
- **block_wise** (`bool`, defaults to `False`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
- **max_unorm** (`float`, defaults to `1.0`): The maximum gradient norm.
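Judging only from the signatures above, `LAMB32bit` appears to be `LAMB` with `optim_bits` pinned at 32; if that reading is right, these two constructions should behave the same (model is a placeholder):

```py
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
# Assumed equivalent, if LAMB32bit simply fixes optim_bits=32:
opt_a = bnb.optim.LAMB(model.parameters(), optim_bits=32)
opt_b = bnb.optim.LAMB32bit(model.parameters())
```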
mt-8")},m(e,a){m(document.head,g),n(e,G,a),n(e,H,a),n(e,R,a),c(L,e,a),n(e,U,a),n(e,w,a),n(e,O,a),n(e,M,a),n(e,J,a),c(B,e,a),n(e,K,a),n(e,h,a),c(T,h,null),m(h,le),m(h,v),c(z,v,null),m(v,ce),m(v,q),n(e,Q,a),c(k,e,a),n(e,X,a),n(e,u,a),c(F,u,null),m(u,pe),m(u,y),c(W,y,null),m(y,be),m(y,N),n(e,Y,a),c(C,e,a),n(e,Z,a),n(e,f,a),c(E,f,null),m(f,_e),m(f,$),c(D,$,null),m($,ge),m($,j),n(e,ee,a),c(P,e,a),n(e,te,a),n(e,I,a),ae=!0},p:xe,i(e){ae||(p(L.$$.fragment,e),p(B.$$.fragment,e),p(T.$$.fragment,e),p(z.$$.fragment,e),p(k.$$.fragment,e),p(F.$$.fragment,e),p(W.$$.fragment,e),p(C.$$.fragment,e),p(E.$$.fragment,e),p(D.$$.fragment,e),p(P.$$.fragment,e),ae=!0)},o(e){b(L.$$.fragment,e),b(B.$$.fragment,e),b(T.$$.fragment,e),b(z.$$.fragment,e),b(k.$$.fragment,e),b(F.$$.fragment,e),b(W.$$.fragment,e),b(C.$$.fragment,e),b(E.$$.fragment,e),b(D.$$.fragment,e),b(P.$$.fragment,e),ae=!1},d(e){e&&(t(G),t(H),t(R),t(U),t(w),t(O),t(M),t(J),t(K),t(h),t(Q),t(X),t(u),t(Y),t(Z),t(f),t(ee),t(te),t(I)),t(g),_(L,e),_(B,e),_(T),_(z),_(k,e),_(F),_(W),_(C,e),_(E),_(D),_(P,e)}}}const ke='{"title":"LAMB","local":"lamb","sections":[{"title":"LAMB","local":"api-class ][ bitsandbytes.optim.LAMB","sections":[],"depth":2},{"title":"LAMB8bit","local":"bitsandbytes.optim.LAMB8bit","sections":[],"depth":2},{"title":"LAMB32bit","local":"bitsandbytes.optim.LAMB32bit","sections":[],"depth":2}],"depth":1}';function Fe(he){return Le(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Pe extends we{constructor(g){super(),Me(this,g,Fe,ze,Ae,{})}}export{Pe as component};
