# AdamW

[AdamW](https://hf.co/papers/1711.05101) is a variant of the `Adam` optimizer that decouples weight decay from the gradient update, based on the observation that the weight decay formulation differs when applied to `SGD` and `Adam`.

bitsandbytes also supports paged optimizers, which take advantage of CUDA's unified memory to transfer optimizer state from the GPU to the CPU when GPU memory is exhausted.
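Concretely, AdamW applies the decay directly to the weights instead of adding an L2 term to the gradient. A sketch of the update from the paper's standard formulation (the schedule multiplier is omitted here), with $\hat{m}_t$ and $\hat{v}_t$ the bias-corrected moment estimates, $\eta$ the learning rate, and $\lambda$ the weight decay:

$$
\theta_t = \theta_{t-1} - \eta \left( \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon} + \lambda\, \theta_{t-1} \right)
$$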
## AdamW

Base AdamW optimizer. [source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1431/bitsandbytes/optim/adamw.py#L8)

```python
class bitsandbytes.optim.AdamW(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False)
```

Parameters:

- **params** (`torch.Tensor`) — The input parameters to optimize.
- **lr** (`float`, defaults to 1e-3) — The learning rate.
- **betas** (`tuple(float, float)`, defaults to (0.9, 0.999)) — The exponential decay rates of the first- and second-order moment estimates.
- **eps** (`float`, defaults to 1e-8) — A small epsilon value that prevents division by zero in the update.
- **weight_decay** (`float`, defaults to 1e-2) — The weight decay value for the optimizer.
- **amsgrad** (`bool`, defaults to `False`) — Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the maximum of past squared gradients instead.
- **optim_bits** (`int`, defaults to 32) — The number of bits of the optimizer state.
- **args** (`object`, defaults to `None`) — An object with additional arguments.
- **min_8bit_size** (`int`, defaults to 4096) — The minimum number of elements a parameter tensor must have for 8-bit optimization.
- **percentile_clipping** (`int`, defaults to 100) — Adapts the clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a given percentile to improve stability.
- **block_wise** (`bool`, defaults to `True`) — Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
- **is_paged** (`bool`, defaults to `False`) — Whether the optimizer is a paged optimizer or not.
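A minimal usage sketch (placeholder model and dummy loss; assumes a CUDA-capable bitsandbytes install): the class is meant as a drop-in replacement for `torch.optim.AdamW`.

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096).cuda()  # placeholder model
optimizer = bnb.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)

for _ in range(10):
    x = torch.randn(8, 4096, device="cuda")
    loss = model(x).pow(2).mean()  # dummy loss
    loss.backward()
    optimizer.step()       # decoupled weight decay is applied in the step
    optimizer.zero_grad()
```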
## AdamW8bit

8-bit AdamW optimizer. [source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1431/bitsandbytes/optim/adamw.py#L69)

```python
class bitsandbytes.optim.AdamW8bit(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False)
```

Accepts the same arguments as [AdamW](#adamw). Optimizer state is quantized to 8 bits for parameter tensors with at least `min_8bit_size` elements.
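Sensitive parameters such as embeddings are commonly kept in 32-bit even when the rest of the optimizer state is 8-bit. The sketch below follows the `GlobalOptimManager` override pattern from the bitsandbytes docs; treat the exact call signature as an assumption to check against your installed version.

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Sequential(
    torch.nn.Embedding(30000, 512),
    torch.nn.Linear(512, 512),
).cuda()

# Keep the embedding's optimizer state in 32-bit; everything else uses 8-bit.
manager = bnb.optim.GlobalOptimManager.get_instance()
manager.register_module_override(model[0], "weight", {"optim_bits": 32})

optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=1e-3)
```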
## AdamW32bit

32-bit AdamW optimizer. [source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1431/bitsandbytes/optim/adamw.py#L130)

```python
class bitsandbytes.optim.AdamW32bit(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False)
```

Accepts the same arguments as [AdamW](#adamw), with the optimizer state kept in 32 bits.
## PagedAdamW

Paged AdamW optimizer. [source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1431/bitsandbytes/optim/adamw.py#L191)

```python
class bitsandbytes.optim.PagedAdamW(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True)
```

Accepts the same arguments as [AdamW](#adamw), except `is_paged`: the paged classes are always paged, so the flag is not a constructor argument.
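Paging is transparent to the training loop. A minimal sketch (placeholder model; assumes a CUDA device, since paging relies on CUDA unified memory):

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(8192, 8192).cuda()  # placeholder model

# Optimizer state lives in unified memory and spills to CPU RAM only
# when the GPU runs out of memory, instead of raising an OOM error.
optimizer = bnb.optim.PagedAdamW(model.parameters(), lr=1e-3)

loss = model(torch.randn(4, 8192, device="cuda")).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```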
## PagedAdamW8bit

Paged 8-bit AdamW optimizer. [source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1431/bitsandbytes/optim/adamw.py#L251)

```python
class bitsandbytes.optim.PagedAdamW8bit(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True)
```

Accepts the same arguments as [PagedAdamW](#pagedadamw), combining paging with 8-bit optimizer state.
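If you train through the Transformers `Trainer`, this optimizer can also be selected by name; `"paged_adamw_8bit"` is the identifier used by recent Transformers releases, but verify it against your installed version.

```python
from transformers import TrainingArguments

# "out" is a placeholder output directory; the optim string selects
# bitsandbytes' PagedAdamW8bit under the hood.
args = TrainingArguments(output_dir="out", optim="paged_adamw_8bit")
```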
## PagedAdamW32bit

Paged 32-bit AdamW optimizer. [source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1431/bitsandbytes/optim/adamw.py#L311)

```python
class bitsandbytes.optim.PagedAdamW32bit(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True)
```

Accepts the same arguments as [PagedAdamW](#pagedadamw), combining paging with 32-bit optimizer state.