# Adam

[Adam (Adaptive moment estimation)](https://hf.co/papers/1412.6980) is an adaptive learning rate optimizer that combines ideas from `SGD` with momentum and `RMSprop` to automatically scale the learning rate. It tracks:

- a weighted average of the past gradients to provide direction (the first moment)
- a weighted average of the *squared* past gradients to adapt the learning rate to each parameter (the second moment)
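For reference, these two moments combine into the standard Adam update from the paper linked above, where $\beta_1, \beta_2$ correspond to `betas`, $\epsilon$ to `eps`, and $\eta$ to `lr`:

$$
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1 - \beta_1)\, g_t \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2 \\
\hat{m}_t &= \frac{m_t}{1 - \beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1 - \beta_2^t} \\
\theta_t &= \theta_{t-1} - \eta\, \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
\end{aligned}
$$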
bitsandbytes also supports paged optimizers, which take advantage of CUDA's unified memory to transfer optimizer state from the GPU to the CPU when GPU memory is exhausted.

## Adam

```python
class bitsandbytes.optim.Adam(
    params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
    amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
    percentile_clipping=100, block_wise=True, is_paged=False,
)
```

([source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1187/bitsandbytes/optim/adam.py#L16))

Base Adam optimizer.

**Parameters:**

- **params** (`torch.Tensor`): The input parameters to optimize.
- **lr** (`float`, defaults to 1e-3): The learning rate.
- **betas** (`tuple(float, float)`, defaults to (0.9, 0.999)): The decay rates of the first- and second-order moments of the optimizer.
- **eps** (`float`, defaults to 1e-8): The epsilon value, which prevents division by zero in the optimizer.
- **weight_decay** (`float`, defaults to 0.0): The weight decay value for the optimizer.
- **amsgrad** (`bool`, defaults to `False`): Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam, which uses the maximum of past squared gradients instead.
- **optim_bits** (`int`, defaults to 32): The number of bits of the optimizer state.
- **args** (`object`, defaults to `None`): An object with additional arguments.
- **min_8bit_size** (`int`, defaults to 4096): The minimum number of elements a parameter tensor must have to be eligible for 8-bit optimization.
- **percentile_clipping** (`int`, defaults to 100): Adapts the clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
- **block_wise** (`bool`, defaults to `True`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
- **is_paged** (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not.
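As a quick illustration, `bnb.optim.Adam` works as a drop-in replacement for `torch.optim.Adam`. The model and loss below are hypothetical stand-ins for the sketch; the optimizer call follows the signature documented above:

```python
import torch
import bitsandbytes as bnb

# Hypothetical toy model; any torch.nn.Module works the same way.
model = torch.nn.Linear(128, 64).cuda()

# Drop-in replacement for torch.optim.Adam.
optimizer = bnb.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8)

inputs = torch.randn(32, 128, device="cuda")
loss = model(inputs).pow(2).mean()  # dummy loss for the sketch
loss.backward()
optimizer.step()
optimizer.zero_grad()
```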
## Adam8bit

```python
class bitsandbytes.optim.Adam8bit(
    params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
    amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
    percentile_clipping=100, block_wise=True, is_paged=False,
)
```

([source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1187/bitsandbytes/optim/adam.py#L77))

8-bit Adam optimizer. Takes the same arguments as [`Adam`](#adam) above.
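A minimal sketch of constructing the 8-bit variant. Note that parameter tensors with fewer than `min_8bit_size` elements keep 32-bit state, so small tensors such as biases are unaffected by quantization:

```python
import torch
import bitsandbytes as bnb

# The weight has 4096 * 4096 elements, well above min_8bit_size.
model = torch.nn.Linear(4096, 4096).cuda()

# 8-bit optimizer state reduces state memory versus 32-bit Adam.
optimizer = bnb.optim.Adam8bit(
    model.parameters(),
    lr=1e-3,
    min_8bit_size=4096,  # tensors with fewer elements keep 32-bit state
    block_wise=True,     # per-block quantization for stability (the default)
)
```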
## Adam32bit

```python
class bitsandbytes.optim.Adam32bit(
    params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
    amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
    percentile_clipping=100, block_wise=True, is_paged=False,
)
```

([source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1187/bitsandbytes/optim/adam.py#L138))

32-bit Adam optimizer. Takes the same arguments as [`Adam`](#adam) above.
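If only some parameters need full 32-bit state while the rest use 8-bit Adam, bitsandbytes provides a `GlobalOptimManager` for per-parameter overrides. This sketch follows the pattern shown in the bitsandbytes README; treat the exact manager API as an assumption to check against your installed version:

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Sequential(
    torch.nn.Embedding(50_000, 512),  # hypothetical embedding layer
    torch.nn.Linear(512, 512),
)

# Register parameters with the manager BEFORE moving the model to the GPU.
mng = bnb.optim.GlobalOptimManager.get_instance()
mng.register_parameters(model.parameters())
model = model.cuda()

# 8-bit optimizer state everywhere by default...
adam = bnb.optim.Adam(model.parameters(), lr=1e-3, optim_bits=8)

# ...except the embedding weights, which keep 32-bit state.
mng.override_config(model[0].weight, "optim_bits", 32)
```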
## PagedAdam

```python
class bitsandbytes.optim.PagedAdam(
    params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
    amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
    percentile_clipping=100, block_wise=True, is_paged=False,
)
```

([source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1187/bitsandbytes/optim/adam.py#L199))

Paged Adam optimizer. Takes the same arguments as [`Adam`](#adam) above.
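Paged optimizers are constructed exactly like their non-paged counterparts; paging only changes where the optimizer state lives under memory pressure. A minimal sketch, assuming a CUDA device with unified-memory support:

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(8192, 8192).cuda()

# Optimizer state is allocated in CUDA unified memory and paged to
# CPU RAM automatically when GPU memory is exhausted.
optimizer = bnb.optim.PagedAdam(model.parameters(), lr=1e-3)
```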
## PagedAdam8bit

```python
class bitsandbytes.optim.PagedAdam8bit(
    params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
    amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
    percentile_clipping=100, block_wise=True, is_paged=False,
)
```

([source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1187/bitsandbytes/optim/adam.py#L260))

8-bit paged Adam optimizer. Takes the same arguments as [`Adam`](#adam) above.
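The 8-bit paged variant combines both memory savers: quantized optimizer state plus paging to the CPU. The `percentile_clipping` value below is an illustrative choice, not a recommended default:

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(8192, 8192).cuda()

optimizer = bnb.optim.PagedAdam8bit(
    model.parameters(),
    lr=1e-3,
    percentile_clipping=95,  # clip at the 95th percentile of recent gradient norms (illustrative)
)
```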
## PagedAdam32bit

```python
class bitsandbytes.optim.PagedAdam32bit(
    params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
    amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096,
    percentile_clipping=100, block_wise=True, is_paged=False,
)
```

([source](https://github.com/bitsandbytes-foundation/bitsandbytes/blob/vr_1187/bitsandbytes/optim/adam.py#L321))

Paged 32-bit Adam optimizer. Takes the same arguments as [`Adam`](#adam) above.
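Since all six classes share one constructor signature, a small helper can select a variant from configuration. The `make_adam` helper below is purely illustrative and not part of the bitsandbytes API:

```python
import bitsandbytes as bnb

# Hypothetical helper: map (bits, paged) to the classes documented above.
_VARIANTS = {
    (32, False): bnb.optim.Adam32bit,
    (8, False): bnb.optim.Adam8bit,
    (32, True): bnb.optim.PagedAdam32bit,
    (8, True): bnb.optim.PagedAdam8bit,
}

def make_adam(params, bits=32, paged=False, **kwargs):
    """Construct an Adam variant; kwargs follow the shared signature above."""
    return _VARIANTS[(bits, paged)](params, **kwargs)
```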