From 63defeadaab055a389fac08e7266ab43860f7d30 Mon Sep 17 00:00:00 2001 From: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> Date: Thu, 19 Feb 2026 17:15:05 -0800 Subject: [PATCH 1/2] Update cudnn-frontend to v1.18 (#2689) update FE to 1.18 Signed-off-by: Charlene Yang <8636796+cyanguwa@users.noreply.github.com> --- 3rdparty/cudnn-frontend | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/cudnn-frontend b/3rdparty/cudnn-frontend index b372d39879..8d19d3182b 160000 --- a/3rdparty/cudnn-frontend +++ b/3rdparty/cudnn-frontend @@ -1 +1 @@ -Subproject commit b372d39879d44c91a8d5b342022e74802b6a8da2 +Subproject commit 8d19d3182bfbc304046a15e9236bec9ff31511fc From e5832221aa901cd9b0e8315926728d2b1c4e942a Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Thu, 19 Feb 2026 17:39:35 -0800 Subject: [PATCH 2/2] [PyTorch] Documentation for op fuser API (#2447) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add documentation for operation fuser API Signed-off-by: Tim Moon * Include TE ops in PyTorch API docs Signed-off-by: Tim Moon * Fix error when building docs Signed-off-by: Tim Moon * Fix typo Review suggestion from @greptile-apps Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> * Fix swapped args to `te.ops.Linear` Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> * Update copyright year Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> * Reorganize TE ops guide Signed-off-by: Tim Moon * Fix typo Signed-off-by: Tim Moon * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Co-authored-by: greptile-apps[bot] 
<165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> * Debug test failure Signed-off-by: Tim Moon * Debug failure when autogenerating docs Signed-off-by: Tim Moon * Debug errors when building docs Signed-off-by: Tim Moon * Include fusion registration functions in docs Signed-off-by: Tim Moon * Update docs/api/pytorch.rst Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> * Debug failure when building docs Signed-off-by: Tim Moon * Poke GitHub Signed-off-by: Tim Moon --------- Signed-off-by: Tim Moon Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Co-authored-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .gitignore | 2 +- docs/api/pytorch.rst | 80 ++++ .../op_fuser/fp8_layernorm_linear.png | Bin 0 -> 17749 bytes docs/examples/op_fuser/layernorm_mlp.png | Bin 0 -> 28980 bytes docs/examples/op_fuser/op_fuser.rst | 353 ++++++++++++++++++ .../op_fuser/residual_layernorm_mlp.png | Bin 0 -> 15620 bytes docs/index.rst | 1 + tests/pytorch/test_fusible_ops.py | 136 ++++++- .../pytorch/ops/basic/activation.py | 16 +- .../pytorch/ops/basic/add_extra_input.py | 2 +- .../pytorch/ops/basic/basic_linear.py | 54 +-- transformer_engine/pytorch/ops/basic/bias.py | 4 +- .../pytorch/ops/basic/grouped_linear.py | 2 +- .../pytorch/ops/basic/layer_norm.py | 8 +- .../pytorch/ops/basic/make_extra_output.py | 2 +- .../pytorch/ops/basic/quantize.py | 8 +- .../pytorch/ops/basic/reshape.py | 2 +- .../pytorch/ops/basic/rmsnorm.py | 6 +- .../pytorch/ops/basic/swiglu.py | 12 +- .../ops/fused/userbuffers_backward_linear.py | 8 +- .../ops/fused/userbuffers_forward_linear.py | 10 +- 
transformer_engine/pytorch/ops/fuser.py | 6 +- transformer_engine/pytorch/ops/linear.py | 18 +- transformer_engine/pytorch/ops/op.py | 4 +- transformer_engine/pytorch/ops/sequential.py | 6 +- 25 files changed, 645 insertions(+), 95 deletions(-) create mode 100644 docs/examples/op_fuser/fp8_layernorm_linear.png create mode 100644 docs/examples/op_fuser/layernorm_mlp.png create mode 100644 docs/examples/op_fuser/op_fuser.rst create mode 100644 docs/examples/op_fuser/residual_layernorm_mlp.png diff --git a/.gitignore b/.gitignore index 7a86041a1e..789d3b0a5f 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,4 @@ compile_commands.json .nfs tensor_dumps/ artifacts/ -*.DS_Store +.DS_Store diff --git a/docs/api/pytorch.rst b/docs/api/pytorch.rst index d1d54c0dda..90f68653cc 100644 --- a/docs/api/pytorch.rst +++ b/docs/api/pytorch.rst @@ -143,6 +143,86 @@ Tensor saving and restoring functions .. autoapifunction:: transformer_engine.pytorch.restore_from_saved +Operation fuser +--------------- + +.. autoapiclass:: transformer_engine.pytorch.ops.Sequential + :members: forward + +.. autoapiclass:: transformer_engine.pytorch.ops.FusibleOperation + :members: fuser_forward, fuser_backward + +.. autoapiclass:: transformer_engine.pytorch.ops.BasicOperation + :members: op_forward, op_backward + +.. autoapiclass:: transformer_engine.pytorch.ops.FusedOperation + :members: fuser_forward, fuser_backward + +.. autoapifunction:: transformer_engine.pytorch.ops.register_forward_fusion + +.. autoapifunction:: transformer_engine.pytorch.ops.register_backward_fusion + +.. autoapiclass:: transformer_engine.pytorch.ops.Linear + +.. autoapiclass:: transformer_engine.pytorch.ops.AddExtraInput + +.. autoapiclass:: transformer_engine.pytorch.ops.AllGather + +.. autoapiclass:: transformer_engine.pytorch.ops.AllReduce + +.. autoapiclass:: transformer_engine.pytorch.ops.BasicLinear + :members: _functional_forward, _functional_backward + +.. autoapiclass:: transformer_engine.pytorch.ops.Bias + +.. 
autoapiclass:: transformer_engine.pytorch.ops.ClampedSwiGLU + +.. autoapiclass:: transformer_engine.pytorch.ops.ConstantScale + +.. autoapiclass:: transformer_engine.pytorch.ops.Dropout + +.. autoapiclass:: transformer_engine.pytorch.ops.GEGLU + +.. autoapiclass:: transformer_engine.pytorch.ops.GELU + +.. autoapiclass:: transformer_engine.pytorch.ops.GLU + +.. autoapiclass:: transformer_engine.pytorch.ops.GroupedLinear + +.. autoapiclass:: transformer_engine.pytorch.ops.Identity + +.. autoapiclass:: transformer_engine.pytorch.ops.L2Normalization + +.. autoapiclass:: transformer_engine.pytorch.ops.LayerNorm + +.. autoapiclass:: transformer_engine.pytorch.ops.MakeExtraOutput + +.. autoapiclass:: transformer_engine.pytorch.ops.QGELU + +.. autoapiclass:: transformer_engine.pytorch.ops.QGEGLU + +.. autoapiclass:: transformer_engine.pytorch.ops.Quantize + +.. autoapiclass:: transformer_engine.pytorch.ops.ReGLU + +.. autoapiclass:: transformer_engine.pytorch.ops.ReLU + +.. autoapiclass:: transformer_engine.pytorch.ops.ReduceScatter + +.. autoapiclass:: transformer_engine.pytorch.ops.Reshape + +.. autoapiclass:: transformer_engine.pytorch.ops.RMSNorm + +.. autoapiclass:: transformer_engine.pytorch.ops.SReGLU + +.. autoapiclass:: transformer_engine.pytorch.ops.SReLU + +.. autoapiclass:: transformer_engine.pytorch.ops.ScaledSwiGLU + +.. autoapiclass:: transformer_engine.pytorch.ops.SiLU + +.. 
autoapiclass:: transformer_engine.pytorch.ops.SwiGLU + Deprecated functions -------------------- diff --git a/docs/examples/op_fuser/fp8_layernorm_linear.png b/docs/examples/op_fuser/fp8_layernorm_linear.png new file mode 100644 index 0000000000000000000000000000000000000000..b5916a615281f02d9eb61182fcac05b5b9acc0dc GIT binary patch literal 17749 zcmZ^r1ymhD(x`EF4=w?MyL)hVm*DR1!3hKnuEE{i-Q6L$yKC?_?C$^XzCG_<&ONv3 z>8kFYuIaf`U)2p$kP}CQ!-WF@0YQ|M5K#gF0Zj+q5HMgsO&k`cAn*ZUCL}8a0#Y3f z|6%|M{7-Bop(G0e;zw|!BCxd`s+o!iE@&X5fOf)1- zWo1FAfiesTBq$mPI8Xuw{y;!+K|ZtrB@ii4y#FmLfqwZ{8(;{b79bG+YNG{we!S3t z_lL|sKf$v=|E)0#?0;H=re}fwZ}~$5$jfQu#|Nyvgr*Y+2w4Be8#HX;`vp)Ku~5-) z){vFqHnOv!H!!v{G@*C5vHxfe!t2fr6m3kL4T#-stZkjR-T6rW(SjQ&e^fJ&691!# zvlSnyhO7dyu$`j`F*`jYJtHYU95FF5ucNUkw~~n1zr=z6_(;v2o$a|97~I_4=-pW8 z?HtV*n7FvO7#Nuun3?H-7IaP?w$29bbhb`p|5WlndPGc|j2tcOoh|Hai9hrj7}~iw z^O2H%bo9Tke~#1H!t~!g**g7eTEGMuK8`Rj(K9mqzr;-3E&e}ZA4mQv_K$h}vpe1o zW!#!dCQf$NE+50-y|E&CP z0eMFY6W~Ptb6o%G<3H>Eqy5M1xD_niO{_IUENo0{oj&Hm!OY0Z@c%sW-;%<1)^?7{ z_69~K{7nB6`H!mqZvBsbH2>WXD zAK0i2;l%eyIYdhN=;60$Fra8N=rdjnt$!|l+o@lEDuUKt#;uLuVkVpwxI)NgSLo z#1BBk5*+9W2Ln{e5=u2g1qaY1QLKnZ3?|Z*1@y;Kbez}o(YsXSGN1wsO4KhtRY;W* zGX&72WH|=~C}#Wu^oLNwUCan*0*1)`KSL}yJ8oNh+oGONO-(H;qpiq`#V6GV0rXo` zqM@Us_xAR_&%cHJ_|cW`X(9dj*PmK57KcjR)&}2K&%0d(T2lQO2$9=f8m-dBSTga6 zbS|f@o?wZqooIpe4xi_%ok1RHf;&>QFyo6Fe#2_cFJBHPGI(m-FxRZz`?Yt4ArO*5 z?&j)Ln#ae-vFUZ(Z>PmmZV5ehV}D{Z^xTv;t}nN`G4P(8oP;71Aq^JDrcFiTv6@fi zi2k(qc)C6M#9|tO%T{7GnOWI(Ev2dXC!R`KN=k}CHnq;67iv|vN?*vq;p*?be5dd0 z{zL|R`NJ5?+?flHMRNFU=?gKp6gYRa?9=Q?ct%h znVA`dw7rO^Xm@Q+PEJJyQf7I1xy%84dlfpRoV%Bo@K3&nRj0_vNQLji!^8LY_fyJ~ z{Li%U#62%>&lf*jZVwfUxa8$01EG-W4 z?cG^zcG}-p+3B&sh)36hQ9wU{EX`365a_I{s|yK%!oy3`ZFP+#Q-1# zkmDB88XFs{;A+o%)rX!pt)a8as1L^aGN-inl_vXlFsPBV^eC{7j5DWet&M4Ec&r$; zhL?jx--&5dRaNmgY(?JT+vy@mQ^u-Cex#VNu(1WB+|#3*UTyc|)bT&=#x?cr>gwp= z@1HW>_w5oqnO};CN5e{Ph;&uK#wg9|y?&J^;Is$PI$f$Z!oe|Q4X>+?YX1K2>s-lh zw~k;FjD+hOLWaP1-=P?in}bPl 
zj;Wp=Fk2XZOCz#hfiO7WpjxkUu5?C1ASb^T78W!}HEN8*GBOAp{ZiPlv9Z}{n)>q6 z(&>#Zn2~6iy9_KWs17FefQDb|H8nM>`FDxI1_XgihiQ&EAr=M)6%^MOSJ6aSG0cw% z1-OSatmgNcdwH8{WM{z-DQ;z+0EOjJ`%ePTH3tJLH4cFR7w z+j6QmG-N&ta`5bIA}BLptGUqTQq6u`EzE2Ve#J$hY}cllziSV$K<7lj(Vij~91ONlt3k~f~CHJT?< zSd~H?l#Nud9W5?e9@BGKoAx(K7OF(}Dy%=p;Nvrwrmu5JR^0QhEClC=%FiU_mQu*r zR`6fD9aaqcvOC$!L#4lJu`$d@enC|r7L>1%6q1EkgTfe!fi{B(&?$$rT0SK@6o#n# zIGWTYFU@_FaX%0d?*E_WH+p0uH_+qbm;;+&suOG zJ+Ng>xq%I|%G->`w!Jn|D}G;kdwcwo6G)Y?uwRg777W#@b>fw;^ad;rI^%lgvNV}@ zc7If%y0oke^Jv3kb&Pbh0DlK_H>YcH{Wn-n76tgjcT=U0^#hHQfL)Fu*ht8u}<*L8uUv?zO*`l^MG>H$Uu}B~4t}4nQ83kd*HP6C0 zw964mkJ%D|+1i81UH%$WI>#7R;94AnrD$%_&5J9R@ijNk)mH&4UgGR9UHo%fI@wvG zpg%Pm0`Hx3PNDGAm!J0BRvH{KL*A2Jm!_)eJ)}mVm-VNT?)3;dOk6<2{)pq&XrN8x zwxxqrZpTghsDPRs6BENhah!BdDf($m7@Tvq{*@5;i_pcxz)n1wkT(oN<-DM~NB>H` z{c*8Nn1mo`;>9GYg6$U?5y2S1vS4epxQFM$13mH@#SH3}C%_3-m{9?PcsEb!xzka~95&5N%7PGHgslt$-Asutt)U zjg**~iG`z{iHWIe90b=v5m(bLemQ(vb*FbjQ47n(_KnPQC-29@QZmD;(_6=3^pdnc z59hQM6XEPaL|hr+Vw0oHS==kqUmhft>N$AmZ|RF>W}*Iynz_Q~HProkHHsz;Smy*G zl#@uXyh}G}Y&-p>0}Bk145^sU^})q-0jR@Bn!}#QkxI3U_0{Zd@#EZ-wpQ?kvEA=c zj=%dPYOC$5ncTKTYFvZND;7m43P@vN*rA|j*WsKAd6^9B!@1-ccv3YzN_xABXuW6{ z{%AwiFV2iau}WK-@C}!z`xOt#tS-Oo)N@ISb|PG+aSXj%1Wi^pd$yjU#Hs|y2}~#} z(}E00A@}=WpYo2$7%OU|-AZ38^CPtvij_Z~ACCp3ia0N0BXz`Rxo%1f@FCyOMgjbiPR42V>7BE+kRTS!iAnQ{hVTRQ+4HGxt+z2n4<04V)wH>ddmA*#5!N#ymw`-uc3Vy4moE0p#_8 zpI+eQ9y&Oi~4hUPZT)&Y8jnMmwePJOyIsz;9?N%S_)2>lt|sU-v`Id z_LP{0Mne+AOM?aWR5IE*Hn}KiI?W%7A4I>y$j96^T(FVtqa(K#nM$sW|Kj7ZYxiT> zpK5TJ!vxbAdcRh=_Y5?iG0CngoKwRe(q$*f8bv6^0YMH)(zVv~(Na~`v%3g|{yQ#^ z*7`}@>h2+Kg&L!`+(FVHag?yxrTl722;?H+vERX6;7HZsDgU^Sic>9Lv0hH+X z(R}TubF7%#K7Z{iiTGPN6J=kw^aQk0cqp)5aL$JagZMkV@0OP}SdV z)*T{GmGh)MTL^53LqYTYv>hs+X@B)0I@!6o96m+X;L6|mVm^fAT&ulWC}VQv`m}2& zD?{C*%_DlP5Pv!<#Mp&qO}GUw^Z6lLWx zX$b?r@$|B?V)&;nvbDhjl9&7>%J_xC5+TXBFH3|)TUNsDbJGKv1ny`TXd2%#CSI2Y zdNAQeM?1_~S&O5E8R}$Mz1j>X_id2u@#hxJY#p!Ulw-LrOOkRzs+}RF*jJYEeCY*Z z;M*zE8cc1Eo>UBYP^m$Mb|-)-c5 
z!MZQQmBv8OU-g%3=&;3$ub1`XcKWhO^y+Vj6Qr`bNAZ};$MD48>0n9D^U zoA<@M&IZaoRRn=h$aVy^sZICE$;0a5nK2|>qJI0jd%om1EKhAGFFRSPwa&Arww8Uv z0`~~&HpRA|&GD(uud|@C+m;Yk80w%&<-uu%x(M4jy(^<$Sonu*y|tl(K8~OCcMId+ zCOGTf%VG8Eo#5Zx%SpKx?5C{W`0iZPFa~zqAsDB1bllJ@p?TmNmH!ZZ9lX0ssWo3Z zvyr^9jei+{S||{v%rb^AYR|-tqgH=qEBhmqOCb5(Y##O?h^~8-JF^dg^lV&Qhi>`84O3?ia3H4tee z*(EJ6Lk1+`+Szyy$<6EDGtM5mk2GXXdx-27$|apRGKkhmExP9L@RXjB2m-zuaGe(y zGk2-9c4F=F8f$Dy%OEmEYJX-vIpb>$Jz$k)@$!;pK=@ICY1>;Tr|a}d4OU_BFd(=k zDiy!Ep^Tq^;eCzjyFjEP8=b9)#j+Eqm$Ci~m4&W30w%%%l0t)K0fgPZsVm z+MS{8a;~wDi+v&()E$5CGmU3@`xkpzFeF|=JL~8zg5JVUdnSCVFqKH! zN%^I{nISZi0ZlS{x}#NtmKjwx(uM2Z+xl`unj!j@fMDz8Df2YeAgZ$O*_KNg#%EA0 z*2TG$k0}@z{hnfiOqxL;1XRvotzP(OltQ%3!?TM-=AENXrQcnKs$Bl$WVYe6W8u?C zSec6i5;Eo+aNnEaNk*-;kebs9>eEbKl7mcHzpu77GZ-9)bT_Nwf>2sO6gM%NkYC(i z-PoBuVj}W4cHJXdvMovWXK7ADNiGM|gblY1@yijQ{t|QXzp;@fRkn@Ls(tqK3K1Z-K+hV(2M5zoZ ze#g`e(qpML1lOTEQyy?l%sgw?ibB3n#;ZhA_?{53^N#;8*OI`)_GIOFT|F9cWq`3D z0R0()dm~QgC2iGR9MffBNQmf+ArcC?c)>OOk9KN#=4!(`{ZN=so(WC8C>0 z``_AH(P+e5-kVBJZRy&%?GW@M8|A-J{J0&IDU=Y`*(NxmAxYw9xQN}!;?k17Ep4hR zI$!X_n80h?T+SCz7{niwYL+y0PH~aMCC#bDp0k{b=LDsl+9`g%s7!=nQvJb+{7N+Z z1ixWpW^ymcz_D9fBQ!=`fAjcO znUr#TeB#AX?<}$>JU~{QSTY95jwrWsRLh3IIHcUwarN=N|FD9e#HOnRJk_`T3@ao?N zSx=Ao4%DVf?fTKL_IOT~)~j!C_plbKDSNUP1nuDY3yYIc?ei(ZH%AB>>%7x;zZ0+C z$V_uJmS6hw8xSYIzS;XP>T6e8_gf%Isc9wI>Qb|fv*Cs>_8Q?5Hvg^|?;pujSmTb+Q&)f=GKqlJb5TUAY#?@WYHxYWex z^Zqn42fKA~d+!UQy~}$_Iy(~O3SW2Kh3F<9&6MPrc|kbQi%M!I4@X;R_4r~veP3$h zrN;9NjM>v&<~K?;9A2_PA*eYgg}(bG4Qd78gW9i4y6#M5V-{b;R9k?aO*CXs{V^)x zwsHyY)**ObrA}6OZS5hPm>%3_tur7q)ld3g15OXu$W4Z{U7i%%Q7Iuny8>DJ2B3-)D zXa>Txb(s4!)kaYLzj3Nds@Om6D=%F}_<%2g_Tr`-sE|j&Y3k^B-RM6l;V=8o^3(5q zKQUPZAK-&zTWn!f7{pSxuIiEFu-5L4QsyFFYlrYZ7gHq6OgT2~u@6@Nu3d@N=Iy7X z#}@7e+?O)Wc!Yb(ly;{jn)&s_yp9OUgbbmuQs?N~{S>q8a1&%=)-QzoNg3EOjV zY2>Y2$YeKUpX_=;YlAp1QVuS>k#;Pt2}4MG3EfT$`%#6LSiLxXf-8-_wK3wrdjt)J zMm|4XmM!n2X&ACeTV@jFS#XrsB6`Z?SbEn^Qiru(C@}0yO4=| 
z@?%nNthZj#_eVy@ZfEzBM#i~)VXOQ+MMs9u?<*nvz9_n{ZcF_XD3tJB7(&hdoAqCSn7a2{Nt7q4E0nMzoKdmq0ee*VthH_8d3P7ISt_{M z2Xkm%2N+y6G)BZlT3a4A42S9FE(o2fo2kE>{V;yb4Xry;B#49LJwvE*f5t4h+@Pnf zFgNI6@8xl^u*EEla(q$61s`;>l=`K@yjJj8EveDCJU7~`qW>a9 zpBZlwMFlfrMOar%Fre&_TP*9c`*H3sf6{YtqcFY`F|H@pVHqFWCb3aZ*{H@4xSLq z#8$>ps9)Am#uS~+oH#sI2ARl4=n>}(K1biMg+g^qjap$%r2n-_-q8K(9jwm$Gf(nY zjr(2E9W@oRd{%s3k^4`hK-YeVxf{j)?G~0{_WcS&3v??uN^Ar&*`m(_xEP)Ap$W=M z)+`1p)mb61d}46NDK?|25Tkoi{i8?;skpe2mnASd?fOCLe6~9MGkWrorTA`C;$hQl z+lCBjBCBICu@Y&Ye{~+@Y4Q!F+S3&Dr%utV&xA$e;>z}V<6`O|Y8InrT$X^!(ud#g z&+VcWNRPvsf6KMVH2Wn8@fYxHN80sx4^u`CU~B#=JpdepM<$TJ#AZh=b7ic8KYW4u z`>d!l=IPs?4MM~KUts3^7YkOtQpS;rg%6&=?TCo>IkEthH{6o;Ls(fB-oPxE|0<^q zmmC0ZVwz_v=A`;f5D+DEUv+m7BOaTo_Q0qVg{AX_wndwR1d}h1+pvIx1Hp8&T)4vH zehJOJK~)J*YTPf0V7?FPFu|m4+?9}kFBCuDMxq@y-lR-_uC929cF82!f2+T^jWm`R zmt~7Qe}M{A__z$VoR$;R#_;d4-1Nx`Vtg>)tSpQNWm`)*q)^)`w}{_Fs{MX8_gHp# z<`>A397Fvxh~7V+*YEi#gDPj_9NP*4ls26u<9{rww2-G?y24{0_(OW$qG(SM1R4L1|GkYTD%B z+3ezqYU!C!lPH11=LO+QUKJIn+(7hW`|S;K%2kBfN3F@e2xR~Mg{;@M-Sd&ZMm=0`_4A6pqY<~wVK~cm9mc1Iwu>&L zy@KnUKM7sxG-gXUQX%TR;?cWR|U;R4?;Y9p{-nx$?}q! zAz&&I5FC#~{HCeu>t+Tr-(_En3|3Q%g#@ESC;w~|%(2HQ?G(JC>e+Js!a#(}1*ECu4yIN}9z7 zV3NAQR3%a{5kYw1g0lc+CKm!?9PU4e2%(i!7+~jg03$7c9Eyhk?0os-TJS2PA_uO8 ze;`0CVr4YodNGgolL(JPLwv3E{yqUF<7Oa>;Ox-YcjrNNA9k4jTO5h^%M7%TRmUUb`rBJNlJ?NlD zik4l4BZ1<$vS+Sh?(<^esLka1B2tK(@dEC7)2!CoLF&SO7#Zkb1Nn1)Wwp@JthkhC zVV8~pu|eiv0jtE-r{e!;Xd&$A09%QV(lW;kYNTQ*Nc+4QE4I@%j;c2i`%7>h7H7@F zk`~t$5BEghE>_qHj8=6ayIBUfqo|PRW2D)sf(m_vaxfx*8B>_x+)3mUgZ&dU6;Yx5 zSGdO%GGMg{taU2KiZ&!4tnlS4=GI=l1*)Fj6*U|-0Q{QjhgavQ3$MrgZ}S0e7!HY2 zFbUj6_6ruSNPecgz_Y{+eMndW<A0zRAsf?2Rs|PPjUDH%B6wnFS zO^#~LBuOxrufR4fSEZcOLz*>E`2t%C5|8%@Fk@r=nAtC|F=8xLmX!IX`j}GvM}(j= z=4(6o$k~#`N5XW9gpVYGxRFE^+=)_BvV)k!5xb(Mq%_L0Q}P^}i3xh};J!7@>Q$|MW(dAXedo|zj`4yAURPy?o%ofL! 
zU@>rf<=B*_`o}^mC%eScPn#T-z|+$uZBaYaH5f^8DXvOpMMo6pm@V~^4SH?#luRaI=gA^*u%rpFI|r(rME@!*UaN1u3SYOm;BAaxf*Z)35x%d zfE|E!n-Y>-NlczUo&l`nIQ6Hd29jC_0DU}{`&HQY(}%Rw6}(&|y5I?2r|HTOL8Aic zS0717d}cCIM}fy{ZLO0^thb0c)W2#+>$S~kTfHOG*y5H){s7_g@!99pdL*3;eEp4w zb^EJ5-^{>frh1HxD*xFV|4AlN=im%QUB?SN2Kp}_>N)q5*0d|ceN|*8TK&Rg_U}0n z0Bl~~?+)%s5uEYgbsQq&X;JHcVJ0)JjjJb8a;Ul$y9Kwf+kU*Qx$yWl7h071lz2@8 zDS$uhuFV9VO@GB=BrsViiQtSI0S?yD#;Zt=iOFy$n#TFaGQqQ1K`Ie z_sL?*7VuwDYQE;f9Z^2^q(}WPmB5+*s!fRdmsHEtNBU^K$A&}NwREVjE5JgsxSTVl z)%q()=&1?(A99VQ5pIbhFTha zV%tqx89mwiY}cu%k`|%=SDHcoOS$}=(`jfD36gXA+D#e*gS@HJUy+UWb)@zhYPkWA z@rbJ?JCgvjteedaey5YvQ8=s0g!w|2ueKN$NKOOZf~@m8V!b4O3677>6@s>w#b;nva+w*QBxK}%ISRNWc&&3iJ;&W2L{L)s z^awpvvDa+Kj#H)UE9ac&J@+mxjsC($HoOQ{==9*~HAwa2O(6Zfl-bzPv?8HiaPqY9 zkuvQs2JML6@tI%oYRIc6ueLuaih1AUyT7>`Y@`%WRPi$M+wi7X3$z$*(7|_MZnY^2 zep|NVz!qti$0xjnb6Q+uhm$q?cAO*6F!25Roe&;;A>*RrSh?&$s7*6xOA-C2B$R8} zZ6C_o#oqS4?PU20WFf6niA30qa=X1~okkHC4ttlKj9)Dn=_xGi)}J!d0Y5lkqX+I^Jytb{!pw*9Qc>F#k+e3d<&e4K*T+z>sPjwQ)QC|VHcC%*lGp3l8J1d^3n zh{%*_oV*}>T{o}pzmPiTik!hqQssA!E;tx&Au|ywQ;4=%nOgei_Fm;}(c}M}DCZ8n zuy3sXTVXy#U)%`^eIs-M9u<=4=*=Nnl5IAMJ3}W^{@%7W{=LNpeXw?Y@Dg44E~WZ6 zKsc-F?OPcs5^ZrDJ(olQg3_4uP-cg2rmr#)kE~0{dFfZIw+h4vfEmT; z%_p0g>~EUlMM-}?zTgCjIv((LF19=Js7$2}B) zrieh+cHllRV&^X4*}q%cv&GX}nPayJP#@j8ZM+iE7ePB^#=nRdgDdz`%jE711IN(G z$M(jGOM&$C$nQw~+`BiZgAU)oJnvF&&od#>HTJXCr7I!ef=P%hp<(>G&0(qKbrLRD zV(gii@it2b@uKOGSB}Pnm*$vdOMiiU^^9~&Yy)B=ygKr5R2ttleS$*$7AQrv14v|xsA}2%Kc^fen>`R z>W3P|A?~Iu^!I$5?c%W7zJa}#5wke9WnVFm;@5G3x3m)v0+*)66-j05pCFqKl+NoW zY^cJ^L9gwf(mS0kyqI0-SmX3Z)NEr=}e!r<@y zFtK&)z5W#tSfEqGUk6tvX3$w)lm9dizNdsFmf(@M+H>TRf>vJ!#j_drHOc=;d%Bpo zp88U;>iSx328A!`cN-27Q9BUQ5u9@E?<+5*)t}7MnHth2Au$kfr0SN^vx$ z_!ac|kvE_zC802+f!;cC&ft!WzsUw8km?v_(ZM~(arwEthG%(ZG`c|ocOZzqp2@9& zvG%Si@`cI*DF=cJ`4iS(w79sjktl4*l-8rY!PI`2VgTdFt3=1#VE;Kkr_WQ;lI6n0KWfMaucXi(Rg+CY}XjmPjQ|wN!nxVq>$rSca zF7#e?gU9@a=DvZ|u!}aP>%B{Ai}sQ_VY%zEB&E1n^Py6J8>q(PCK9xt88IP`V0aCV zV^=ZzQL(hOIF(KqVG+seJ6<6eKG)nZwF@A$Z=29$tz{gYP?AN|7c`7`Indrl$xw 
zzqq2J#{SH-J{UsVkD%|5SDl<)YVEvdE814Gm0>+bH!i!eN_m5Y`avXl)#s?$Qr2f( zR*Yg(PAbRGE||D`9_KSs0HUEsc=Wv2ynPaE!$(HvUB$8wY*nhPyZQw000vZwQ%l?h z0uT%!-vI<;l^PCb6(_7C#;Nh7&$cfVJ&faY+Sx7w|3|s6H1CeuE);)1G3jOwgDv$M zpQeJS*qs!U-N`T&0g+%d2?uW*$E2va+Q2Sbo}MS$FPOy`ge1WX+-(|KXcGu3xsM^W z<`IMj&3LCLD?t;y;$`b+&kj#N4^c#BNMw_h9b;7gL1hU5@*)@ISjd7doY#tEXaJU% zHCgnGtk2+~$D>(j6X6r}BQU;Y_>}rTJU62nk!7?c=^C5JMY}wn4JV@#k!OsRc^D)= zh(Cn#-#BTdN3Zp4@gsY?SUCsH1{m-M3beV>A6y{19dDT^&yA7v=;t&v+q^`%j@Dh? zlfdsoaPqgwBepq;DnpQC`;|(cds(i{o2SM@JUb*G%5gD@ z)_$8Yzl>R6edii2kfh-GSxOMOpn0tO)F^SB%s9myLE1Z-%Muw9GnPA_uR#uKnYMyO zz7%Z;wWM9bsnU01Xw-2~Yz{+PV&UPa%}w)XegXfLS!Z8`l}NhO*SF}Q@@6F)oCw0C>$B>Mnh4UP|4mY}3oQgH!v_!J{%* z8u!b!ZY(!0qA0znP#4)0uh0;o3~g5>dp!$s>V5ep+@;3RPTG~T1a0^Upo%p$A3JEOn+-e zP89#$d4TMCQ-0mG*76gQM7lrCJ{QA*9s1PBA-(JyQBgTbRz*-<>)cBZ zfrIcSDgrSIT87d+1?@G3R6s{p(fVnd8I7m*Q=054-B2-aXl6!PJuk!vZr$(km7qNs z>xmv)+Ea{2X~)KQr@xynJZBSRK}`tnlRp|{gBwSY&O7dy@6C8I9(1d27*ELpeV&2n zELB!hgN=c|mkCt}_4W32#qw(&JCI_8iP=OKt#}1lp|ff-k#G9iGr5`xh5CVrMt2)_ z3?U0%0Gq^T-ZDi0Oo<|nDTKR_T1|oO9jOqXuVX_SJtruyrJx?HXL*!Be)ad>7(-;tt7j9)o zDDw@MXthC+P;Eiam9J{NylTHCM^H1+H93}l`T0G@Q&N|@Omz`r_=BG31hQEFJ5WBnn}ugf~OulB|qJpL7HRrV?VQ zv{#?K@F~64g*Xb_PpS1Eyb^y`(7@nkdAJNYXN__c*0~<`-sLb7Jsb`Gdy8g`n{OxI z`n|)F(3$?;4K}sQ8_jk?!u1exqY1;-x>xPj>9SW9)A1NY$;^AKy*s*yg}9&d<xKAZoaUS_cRf7+v801w)^@yJQttd zEx{p2*1kF8yME3J)Svfl!_H)K7S>mDp><;G$%V!~;JbOrn7q<393^+gU(Z>34--%2 zQVGR`p&K>^hdN_??>iEnGFJ##T0Jq|>ohiwGNe#_i1oTmmY&abB1CK0dtR1hc;jQO zZ=4hkWl`JIp>VFBietu1`AP&l!-oZe3);;-W=jx&mW^j@;o#nqavXq|-n)_YFPDw{x^rnL%Ep$lVgLsVm%9ALURQefG;?EEbCe9u zYHFekxyKe%K1YMvPHY6`Vu3>)S+5c8jM-l6ak?Ayy{`5$dDe<%f*_!nYs|uiR!0by zZw~>|%fg&>sIS9S?0P=OhQ*K_h&CWYaCCT@Z3ms)5zN-kf9`redJ8pyWZnXz3A3j+ z2Zp)+cIf&hUoF09>$?}N6&>Qb)wQo&AAB@bthOKY~Ox8ds>(rS}N z*K$gk`||g(je)TQ&Xo}9y%bV=nn%!X+p+thYE&o`%xrU$b{MIA4>gm`d@v7<-hmjo zqxosGd-f5~NSo<8&1sEv6QUp}KC_|e(nG{v1*5zYfOtZIqc{>QPsiuZ{M@Uz&O)|E zvAJ-U^Ua}2A=&CB|HTfCd2y4K@=HAs_-XIglrAWSWK|X=9J3iHEVR38p}r<=onfcqMVL`sSkvD^lyM3qYzhbSL=y2}L)$0wxl 
z!RW{QxPfdHGhap6GHx<^M z{fYHCg^Zk0HhLi?4V(zbLKeZZ*F#=gxId;?WpcAst4m`nyZ78Pj8J8F7rNLSf*$_w zN9^^*lv?ntUzZO~*WQ~1Xib~)ZHamadQ0qQEq9fJPcIw9orQ7-z7eQjI2l8yGDcr9 z3jKH7@$pZmqaNt}VJHF{F}2X#9lY^~+<8{SatU@sEZdLya7zZa`?GZKc6s27`?3g> zd6RR6z99(XnVvwAiA*$QRL2J-*m+fTjz{k)1=X{QFu6Kuc=$>3)@7pk*GyNx`N2{w zys6x)h1!b7Z6XC)bStB`orTN_&apG~!Rl%H*O(jtk$!6YI#g`3m(1cU_CM6svRp4m z3UbgJ6Fn{Q6A^3w#!#S+%1_pg1T)#1DH)f0^NfP1p?%&q-KSS01juQ>>lXzzq!xE; z9L?|Tu5MxKSjAol(UGMs2XK6QQcZmq{Kx8cjHPGf} z1mFo%1Nkr6iNz5FD8%gxpcsLdr1_>O^Af=dKS7Sz>Z@S+(5Tq57g6@2)xq)>-TO_4HAeAog%PY=bCJb;MxRL_ zf2M`j?-D!6;YEAU*n|CNc)3+IB{ZS60PbEbXj3S_;ZJQ> zlS0x~!^-J~4&aCqQiVgQNUA`Ux8VLq|jYaVb1g8PdSZ!d&Z2x2livVCv zw!8kwMTK#e5X^#yB-l^j1<)ojm@a@#7syHB1OcGK{{buI>qR}t!9=p80e5+ehzMwl ztl;g)5fOd;D?gIYU*Z_mb)3#C?v6(kTM%Z+;!lF)N~jS(1Y5#wL&rF zrOJhAcO@Z$B>bk$|6tk(?EbKuu_91IqpC-$j-fx(#Mm3K~Wl)F;i7Rlm)ASBui+mw8CNt?vrE+|pczwrxd7RWJ`b*p^ zQ$g;7RT9N6W5M2RBvUJsNO;Apv4Rc>C?MKyJWc|X(&Ppak62V}Mio9CBm8!LgHd@k zG#>Td?qGP?r*78uL!UdEL_KBs{4)Am zl>e|=j3VN}&jLwxZdf5NN4!8PZyU;&a+{Q^`EmfV^yNr(`@BT6i?OVaNz zygBF3t#lhsDl<*%HQ>UqTowVDyKMn38oZLFw=3}d8#=Vjo`n@PiaEXMID#{-i3N&; z?6uD=os45E3rYfVDhrsdN|r68@<#x=>w*VORo;Q5fdv*+Q2irFA)GgVlpHin#BpyV zVSC%KA_xluBR(m~i9B-GJPyFzHZuiv?j%)C2Xe^B(dyrP>g9j)1KFuF;7D4k*k?&;xCUss3E z?Q9ODzg~7jyJf?DdWaTyPHK_lVga5;WvmDAHB^wf@lHwnE{Kwyzi=ky=V6^}O5LImA>9H#uu z2>bKA!Lh2oK0*7cZ*2MWPwn)l+Xp)q=^mVg+1VQ)GkS0gx9+nyE!}WQx&c*|oz^Rc zf{B9K9P#Gq>C1{;rvb!rsXD`L-tqBqPfrgqbjbmbQT=h6l$6lufh6%Lhwa1Z+;5&o zG^kATRNr0tdj6xEKQOaka1Doj$bru4EeY?p2&lsz27kExi5d==B z7Xu5{k~r2WjKJyf9V)=3BHpu9xgfpgsjt0;Jt~`1? zN$n(PE*`vOX%CkM=y-qvPvBaV=_a0Hz#`oWw59fWr~@!7xHN!@@f}M%H-Wa)T7u$j zJ<};rxgP^8_m8RiPI3j7`{zK*nJPsCKt}~Um;|&)GHhobu%1|mX~Edcg! 
z7(K}M?;lLa=Qqy3zF`WW{-Ia^{RbL)vH<2!8R|C$z#5m~Amj_lMfR;bq?7%>KTt7q zL8p+yu&t)9hpv*MfVs0Hi>ZaPnI(&lqswnBfRK*>r08hrVM^xX=-}io;3G`&7lZ($ z{JWZ!g6uC44|`z>T_sgA31>G;G9DH-7B&hIWHK@`AvX&v0d+~~ztJIo2~*g3c(@3# zvU+=avv_l|IJ;T1vh(xvv$Aopa&Rz1Aei0XIeD1+Fgv+Z{)@;TI+B*|=5Dqw9=6U- zWWVW}nmK!V2vbn}X7ul$f63`#YxNH&C-=X*1?eE`?-o{e7B<%Z3C+^S_WuU$cgw%f z{_5Ai*a`h6Ch%6>(%sp?^S4+c?0iChariHl|H$_*gn!_*oNPTrIQ~ZYZ`J=DOZT7l zf7twI<$p&|akI6A4CKG$`kTjp*8K+bs{aoC7oWHP@ZnJ!19g;#;E&zEJa{wZciM^Z=Cm^cR{)uU0T)|JvhQBU zN8DDvF@(F!MV0g)ev-6e z$WeRI^f3Px3c~RK=HL8+_h4ng=)~-n{|E&cgn)ne{eLkCxSw{#2=HTrb1Rt5Hs!Ng zIcPd#hpH?YHq6UM=}D0=lG%55;uCPTtOjrS*)Wt9Vg0EzMHpgxkdj^4_}pTw7QTSQ zX);Hskt1VE*hBP6g>v{adUGo*hq)3Zp>j~5+y}aacR#DCAY-zo3(W&(M(W zjp3kRP!QDlG{1znzT4*1naa-NR;+Nv$&t4=fyRyRB;}mSu(jUsr|+Pl5(5- z;IV+kF8FG7s!Wx+6$<8FKRK*%oF}<4Y{G*Dr^h}LP@CJFYf}a*_UD9$ReUK1AD9Bb zkRb+t&{)VNQ%%htix~AZO)HsE%n#qG@(12uUm-qcz*C zcf>DhcHQ>SQxlJnXE9@f}M_ zK~kt*M%f`;4m zi%Z#tyxlZ5FBSm(5jdTB??3WqmD32lXv?W>m!wtXCT*a%rr!ZQNIb~9(AtzDgGE5( z+#=5R^`4JGE#{4{q)W?sKVx-lEpYr?eVF+4r3od`!N^a!JN5DTxWDWnkhfxa{^jO& zF5ugvBdv%Z-+qlu(5yV)*BcN}2xNyGIWsK(Owyp4t}{W@$bNDl%ninII+qa)l~Da2 z*NLsgmkcUQZ7UXTI3lW&^kktBQK+oSsCx|9oGANf`Oj_V#!c65a2RfguWSdqP;k6N@+GG6$5!RKTiw2A+1# zZ!8c_onRyuf|P4{;LX)_^rNibGfP-Xp^6#{A~5RvDRd>}pQN`w9>u2)IbUD=a`(9e z8=o3f%vV)AG7;rgbK?Q=qaLrXq_pd~wDW2G0y&0JePLBzHj?VJjd7XeNg4Vuf5uoqf?+j-- z{UF~vYzSKIzYMZ-_8;x7;naxhTDXA)1>Fv;#Y}g=OSP8J;whdyt)kI>eETw#y-Ha3 zc3JOL-sHg|f57>`_TZDnGG$H2J@beA-67#~$M!qZ#ITX{Q(f-ejoK?6+Rl|uON5Uq zR;1na8!O7)dY)#o~CgjOSE14hN$$3?!Z`U^!m7Cb(-&+~lIQ>dXTsAk>Z7n=Lx`DF%u<9$8c z0NY;LTZH7ypz^~*QXsz{9g^E_@BB?|UXaC8+irnBHlF5}gQGZ(g5h=IfY%Mlr}{~c z?Y{HHIL;@#yFTYW!_LNr<(1XM7@rR!;GWW)cU{rRTvB$Pjh%ECb%nT^re|6n4Ef$) zj7?#QgeWH)uOdWrwfnQZj$=**2K|VbO!3bW7w9XYOQj}@cJcXD;^Yqw9wu4xI-D!< z1+yKc@c0;;nPKB13d&Sn%B21vC(d<@6q-TqVab>6Z;YJx(y|<5H8#L!uW&SVT5QO3 z-7c9$2NtT4AoeB^ayEGCGOF}}BxWibRm|N}*m{;0-bQjsin`Mc^$tEOA9cp3gTP76 z=SbCfX5XFWBdhKfE$QGNRJ#vcHd+Y1w=M;L0gr^4h@h!N%N$O34uzw_Ke0R)UL`eG 
zMXlDyIG&wOe5}n~N!$2o=ePC7DygTi748nEA4SUZvRNjBT*A)&aGXf7-IoS6iAR%t z2BU|JNOk8})PUs1f!1JC$I#I4tR*Lg*FCwvz*TL2!g`}nS`r=q!6Rywu=bPC_(^lw zHMN+$SS^LTxYOder;d#9;>l9BM_?iwpm!}T&*-!5Sljp$G|D9dA7V5dDw2s>dGBm*})JoQbT2aWNc+2PEm_`#Dh3a3w9{HTjn$t$uEcs}zlo@Bd z@>WZ&@wN=%JohW-JHjc}QAW|TLnU674ypA7;wR0AX#~vk$N_zq<(ACm0B^Fzlapzrwe9eci2k=W51(b`h*m>**olvypB{hw#KV)4D5cZoZjFu4otf)dY^VE1 zB!Z5bGoSzQ)@H{qXi_SCd=cyTl>Wko_MrZ-kz7A2MrN9%3nj)n_^dn-=#W@zph_hRoSY{HzA>e z7%v~tpy>Qie=kseVd`otdFS%Rv+%UQ)?t$}-WQ*aU~uvDi0(iD1Xh`$PvQ zf`nE&a#HBxg?y!XBR!!rFkX9sm(E17&inb?6=Jj}7-s1Gq(YqGq6swOnC8f7go2q( zWNM;uVr5Vj@MUwbKCSvL%J z2XE3A18@pzxJq&lJlbtPxYs;A70K~Kl!!n&{&ssi7JvvZ zOhR#Tu;@W|PsiiW#-J-;KJN|_Bw7FA;66U~f)VJ@+2*jrLZ}~Sn)D^|?CxnA7EET24<{G*TGp`_!#Hl? zCA6nT1f}NacuVl4(0%*nhFv1fTjP-Z$G`On51*deVB+0YA+t?7As$C5>eIU|y8ux` zY&pK<2pDE3MbRT_hVLgv^+aaDp&!imlC*O15=;HxZ48Y+h3a`<5#yf8tu)=gm8Rv* zdC?aJt*pG7$mwVizUMUqv)^t->Dr*-i@kEfpZj|KjQ+JI>GUg%<#yl6ExU^6#$bOh zpx?5Gx5)HOuqX-hqt>(3mn)Ec&gI~;@6peDkFKDXpLa#{aQv2Rw26VAg{$+5K035) zw28WND6qNoJR;@`$LEqo@+}WBjv(A5B3Y-ssZI=lW@Z{YD?h!Z$F&T${rM||YNe^E z(V)>D6C2STg#Z?vi^0dL3{PKA|LfBo9nXEpy3^8v-gPWeSS`#LWn~eXB&M_v=?f1= zkQ#soe}F)oc}3tF*heibT20Ggx4eqhOd3CW?bz0Ew#+`v51QQEKGoGW7#i@r@!B9G zS_~w-AC%BI+|3euzO8Kqc0w~%k6P8e29kbnMUhEcjQ4nsOAJ5xIr?RWA1;=T9{LSZ z6!&GKuHJmk(>H-zs(j+F7@4(%ydH1tZ6>e;1G96qLiKUBxdkyl4o;FzJjb?No0}xm z5(aVmO&>J+xUr{`a&_TE!rEU8C9*|x_)X1)={tiwhLhspR3_2#bEfgM6@|X(K8AM; ziweQ{B=Q9b#eaOh!)JNu2m%O7%9&DYKCTY2E-&eW=>r2EclJ?=*w=h~e09Rb-_6ws z2|rBVm%)F_R)B8^-5$>fqnmx-w0kht4z8_F?Dh!h>78Iyp~R2Rj5_HIIfs!ZE9aZG2f-?ty6RH53vUFGX-V);x%O<%(~%$1Q=VdM2&NA= znJJ)$9FkCKD}%$PsFu|Md?YM&(fkk47ztN)bcE_wCEAQmSGc0YdDQdz zMNeNXtH2cO($L)W3(d8bt}0~1{IvOaXHej5etO>Y;>EY;{JF}n%P=ri%Xgsz9hWs> zDVQ9EDVfI$w=J7`V)P)-Mfl-PTvRD*?yg29QrrM!Az*Vq7&k@Y(T?p_kbWYTwI`|jQ3 zM7#Z(5UhIk!S+@;Rk?M#lcR&xWsM;hQHYNt^xiglC&$ecaW=?V0tPY6@x;)X)(Dfs zni$fJGLs&4K#SnN7MJe94^}so##BKqC1+z)OV)Q*=tlFBoD2DeU+&*^gY>o0bT7iu zYBhVeRX>a1&L6C%Xan+(<2p5HLtqc}6`r|vA6-+n(Y0&djzb9}i3ia^Q$Sy!mnIVK 
zatizcZrBqVgdzjWnXpPdLue5AV;0dBi5Pdwt@L1VOEWWp(jU;3@KuSQF0Z%hesxb` zG(}qb5UPyMGW;y>Ex`wcU;~L?f1)qi$i?nyOmaAl;5uKC{&GG%!ooaU<>JvW-X6F1 z=&YR;8sK{C`)mKsI4R`0;i$w=b%bWeo?sjhv+w%9$Q588z9rA;<*0GiXdZ;7cxH*l7(5I#^zf1PbMYEE| za{g(;2i+E<2K`8`XxiRA{cwNFj85dU!G4FF4F^|MTKT>y)MPDFpLCcS4#8f1=b>Bp z@kevk)7@Us1Brh#r6%t-(c&4yRTR@18)^$@a3UW_%~_Jn zN~Al2DoRCCQm!n*Msh~lt5h{%I%oB?Xx-2kjswg>dpch7SYX3dSqC3kg{i?-BPJ^( zdE8@OQu=6`MoY=LaUNRhV+Uqwsh#yu<9-jiT4T47Vha>g3$~OZm*GLSZrv{1Al?Y? zXRk&hYjkfi0MNXSy{ugYbm?rSi{R^dS94wq{AC_$TpwIl`WJQ*iU-e4OmZe$-# ziRzM5yf@p(n|HQ={pDRf+$SKz6eEUTJyc$kf*{puW&~en{VOmY|JK!%BLp`o8^a$t z$Ayw`3+;qu05!|C?^_NYC!b12B9u7=`}y6LOprL~Qs+5CNA9Qgb`~}eDrW^k(6;R!HII+UlnBdy>IiAk(KKnJ~J6{im3EG)4z@XhJ6K+>(Lq9?dxa z4EJ!__v@hYIA}V5gy?v-ZqN$uD+OKoxxYho-_3TuSe z7(GltP+Aq)d=8 ze%v*@I$XA-ot#K>+CGbbk;XtrITDS~Mb1PAR=D0}RrMbz2z@(}Df9WjRv3eTzM~)h zEwgm9(F+kl7Z3Oxh{Z^*3KtE{Vp*LVY!rdVuN`3a0SAPj!7$0jHWHIf`AV~Z2+)6! zm&E+&bTp_>3xdBJM9X|gj1OkGo*oP$_x?`gW%ha>V#Hxw>%f77w~ez>2a28+M(^9u zwu!(ot?0mwQZgbF=K*M4^j?9SxIuOJq@+!GmI8_R-y2{sszr3M3w$;HBE%*!`MMB! 
zUlX2zMrtD#Ce89_$_2j%LmD4Sfc`v5exKw7m4mQ?PM|vbaqs;2tU!fg$rt1m>uG`P zYV6DUI%$kDuS@Tj|C1GdvLs^g`bZJ4**?~@@Y0>V!5pE~GkZLpF1;@*s^7cC0`5y7 z+Qq%kP`sfmD!6R?_~9@U&v%Kncj|oO{mGGC)4j*LZ_AmD(y(>+7T4m{+XIh`Coc5n zg^ZEV0lDwDIx<9hd(Q#Jf;ovQp76c_;qnQAmBcjiYBGq#ihG;u$x)xk#@U>5&W*St zX(ar0Hdq-~^Zh$%Uy(lrmHinpF%^fX>5zDBy0!(E)j)$kn#<@>$B zw>l;wlDr={Ra|07J?W6OlzN5g6?1M}K}N*3Ykz2RWiqn_G1h?rt)BCH=v^*iN_XKY ziB6(V9cgY|Uz*xDu;d;Ltaf5_L>AE4EGKoXBY?mhaz8u?TtzV#>YfdfOrH^oS0Y6w zcv`OB;NV3V0g+Ov+x(?=)6s5UV0v;~G~7W`lbd4E+{}ztx>ln`){3DLkuF_g3i2B> zu1G0s^mpZ;fN60#`8bAiV&Y6V`A|!bqiUYrApup_z))oV&!%E){i}%@8m-vqc<#%O z4S3!PbTSB5DXMa60!j*En;@tWY%<|^HNI}X5Sis%yv2rOD#OG(Dhf(Nx@@W{m zMnM82{v*U|Jg^=aG#?xuHpEpT$w1NSQb3Or^Mwf~>|fsQ=wNz?uevu~Kl4EPk8_#= z;`vRMl~85><8IF4fH;#C+dM0r{;Wt4y2Mp2J?&YsY7j z5$0s5W2tX&Ngaw8<99cCbNJKqggan`Ns3=oRV9_7Xjf~!`m^jrDzWxE9fAD5Ie?=$ zNL=XqOh~_ZTsl%6jSp(23q&`1S#Qg*bC1HiTMva?|utrc+ zFojtg28lM3$H2+7^r`T&eBH6ag^|E3nLnQ35v=@dr*tCc(C-9|3H*dv3RLvTK-!qK zGFh~5?OisR{m#_ZF6fkq(TQ@$jJ<;^EMyi_WgYr)U8{ML5^ zao)Op6a&o#llRpjDQmof(^ibPcZxUL6+{rdabOYq%ku33lwRm)m`Gvqfbi;ku?(E! zB>^q8PPNZ9%U|eU#p$ll+LSg$m}}rjlqkB^9^roUvUSdlrky7&?PAA>Yll@*Y}U2T zI^r7Y|G9|FqQO--j3xmJQIR0!MM4PJ+sB>%vK_Vo>OnumG+`QJRpEeggb~JU3eJP) zo~rSV#XuQQFJ0+}{es#p&TLXvQJ2CM*r$EuMFu8gFg@rF*O7pZOzDhN<*Rps;iP}9 zt$*L6iZ>%Gu-(^=I$A7SCgl4b)>Ay89$I0-12#t40j{T<1_Kjlx3L!#Cs67bi$hBxZbK68>tdh0^p8O;u9e+v&hO+_wo zgO(LVH5w9v={tsEX<}Wd==#iX9fbT(j6^uxQrw<60_lpwYG)a!@7v@G!?n2sn>@MI z>0*K{7YId3jjxfj3Y_FH^jBMLN#er8cB(xP<%o3=Y(B`4fBn$7`6jm3#pGK;z&T3m zqhc>qT&UzVMo%CiO^gomLo~X6!$<8uT5qBb*2yXVtOJpK8Ck>U-|>|0Ux>*dQ(s4U zZi>HyOJw|zu#ar5T)Ehvb)=XCS)rx#rJ9s~&{8QOL4%(k!jUEaUXFjrFmPtR%^6}+ zsxuofq9ibPGPN3E<1nw2#d=Trin!W;)S^9Cz}E3(ru!RG-1^VAX)ZPwG^&QdRun`Z zq`|s;SJAj5ZLmfMu2v2+Wh<8ME?+K)8XatoQwwoUfh_NuFY^Iq zPOs;DttQ;l1f4Vp@DsS{eu+kHY`8sl8we_(Wq^k?s6%2LgLH8<8hvCJ1%JFnYq?vS z8U^-57t3oWyaOWp`Hct9cz@!=MP-QfAf3}?oBOz4;s>jkv&w4wV*_;eV=<{|dBcG? 
zm<=7T@t*nui(vg;p=+Yi@S5jZ^~{_g4iZJN(678hj4D;*V;XaCWXgqg4kT3Ag(7!u z{)OLD>R`WbXAfrbVNq9B9mpE8;BYvOO*juT@C75z}%+T2EU!K*|~#wLhqw8&qyjy;9J58Cr!YrSSJX-xh3ph%WZ^I5IRIOzfpt z8>35Hb@N2Vx5`}ACPgviVt>I0y#nU&c3tiEYrPFx^WAc|Z|O=reoKkf2Y1gSioxhE z*p*ZhfntFu8m!hK$0yr-_aVt;vY&(Ar?Kbx#irxq8-x1~BTSqMwU2Jc{KkX8BK^Q$ z)7wY{{h22A9~LicFxLoi?!*}Tcf!Nq)tZ(oaUcr(w>>{3%bMsNA#`rUOA}*9v_?isDvoi-W@Kjb!*GMJ107b1_s%04fUUOA3Rm% zTQNoE0)-{{B{P~Lru&-L^|J+A7b79@D3|(+_VhNe*SHZC<_T^9%M*e-XH|X(tL7Z` z>gXcI=2BVFsks3~q{m)Ni}$30hQKM%bw7Zua18{BsE|K~VXjhc+{GLR22Ui{d&7r6 z)>SsY+H%lkwcy|^H=b-^<9>eALK)V(-rRE0t%^pcIkz-XLHQcG1+)sE7;kX0H90pl z?ENOEj$#E~LNIs9GiZH72ZMO*M(ewN4)Ru92z=~MfKsRxS#>xc|*a?7v8OFUF`gWm~D9R)eGK3XhUfw7$uuCPN2D$I9c}4|ykcyh{b1&)f0% z4v>HE^sI3)j3th#!rM+z4ms5gC!WJ*c@AXmu2OrVGk@&GWQI$a6USfCf(!D|ymeXv zZ%_5J6=bmnL7m>7BXuc#+lw$-}ekHtP7FKE`l%_hZd5vXh6I5TJ*`EihFer+u zUZyQ)+wi9ihVF5o$PyQ4yjFb8AI7qEPM=~GqA!@|^Ltv+r#p`fMF+=gn|nH(CP3xq znf4Q#sCfG%Vft=uj}ItC7!BnNsF^;z=H;+3fR#ft(pCSf{mo$}e8UCFSC2X>U!9r@ zRqIu)=wRRPTz8y|8o5!S3l>;st^wUnPHeUh&9LG-@TP+Fe7?ghEf@AtQ)|6Fi}4@@ zuAazFMSXmpy{fvD(9WymO{ROqyg)M?TN-``;s|fpT51gFX^ePCmQ6Bc%QYBYsDGMc z)UQ<%XDd${hBGNTO~`rKZVMSq72Q$ijNgju#89Hp{&}HiqvJ!Ymm_$Mc1K$6-Qs@J zpcP4M^TV_*j^MCS!MO)vI&{|4(nU8%5J36*JDUaUkkrK5vMzyqRpWf_eu62Ihh_D| zS$!|MSz2R4P=Q8^6VwE`c2c)3d2K*mbpSYs+8*U7^Q@2hxfCbQJk3z3*zK(%^B248 zDg2iMh|a?9rT?w5@`rTf3yC$9NCa2v$zJlDCCi}jhk{SBzdj!b=_~M3!9Dn=?|y$z zLb>dl@Ga0eC$EIi0}h4pke&G@P_*ZEbSEo0DE*_S|5^_g{YN0u$0+sy`JM`~-IG?7 zki>8G;4SBR1=7cel|ghY;I)pv4jcmGHajaI0@gbPBzlAT^i9NvT_=G)qpPPb1m_E5 ze9~Z(k-eo8n&Yckm(Ss~y zp|3=K_xbYGF2?i69ERu2^LWzw#|lJpDAs_pRF_s~7|1VH3Z7u#lWH9;@*R@dZuf<) z4I#qjy1qZx#(n3vi2f$O<>AUpQoE;;q9$G{XIoDotm~V#lzy3~#UN55=N@fBP7dT? 
zw&DTV=&L^USA6x~0YPssJ@tOTwPu(%fJ-hR(?0^IqCD=Sc^QIUfVOJN{fX(v1f6^n z`zWaO3v=+p*1F?9J5d~eD?={`jTg7WMdX9v!E0bm1QN1ZV5f-VNnlQ7(A>B6tU?0- z&*F!6`LSs{Aqx#uOVyO&B{Q3eEUupnv#hsTHiRD^XYaPs9P5)2pC^5Jjo$emX9xO( znN=X*P3aQ(Uat*QA{}3wfc&x)Lk0}gGqSUSVABdG8RiTF@$U_aWnRtEo#BDaZ1EB` zBEx$=;1|dAlj?HL=x$PLEdu9^>hs6|R*wEeOFhj?x*Mfe^H{7%2v=h=AaP;RliKDd z)w~D^EKBG+a%bWrB7FB>sJ%UC(UU?8bOq=a%(Wd zzQ8L*sXps|<>d5M2!1SR<^D!#X!hkJ>4?&I#cfy>bmg?fNU}Ve_9z=yRb(D!s zOpR58MX}cQW;5Aa|AjQBF~`+o3LfJcW9hK7fKVgTum;ALHacWrF@Qjr&Lc5m6j=ec zpuQjM|A6ooUM=ni-rP8n)LfWr^x^|KpKb?@oI1}j zwRg~7<)LQBaTiWXot-Xq)-=5RCZ;a-nQPRQ&Bs%C&Pn)TuJ5xuBJrGj&)T-S6>&vG z85%W{R&B2SN+yPWGZwQ^fK27=1nwmdFh1 zJblWb;V#IMLQWlmqphD!b%Rnasg+`?MY^S@>sX{UUNlB0m59YzU?XvQa14Am-1g;E zxwsnig;?goVz|u5Rb={3rJ87%t}ArAC2_7R--{WetDIwNV%Zr{{cT2!Y}rjE6Upwr z%`r@{d67s6HB4Oi+1P6*!?Alu#Sxq`LfrT*zbdEaiEdDjbVX3@;jVP{F|Tn=#=Hsp zepro2FSURu7aC7|Q^ZzQ`8UxQIeMYSdyk|rBn82Hq1e+Q6owNL1tWr-rfk%TOXK|w zYH)!gWva_${f*soYN~1erv|I!&cM$N`oqtc=FqPj-A9+Ietd9opo%a>{(|7|2b~R_ zbJ;!l(yJanF#(kQSu@pb_%kti*i!YZCsYI0D&bK#6LPqQd0M!YYLjOu0gk3b1@Hs#S!WSFDvt+`p#MeZO!snlzS&QLO zgS9vc7Pw0t$?zYRw%q1>pfq?y-@8HGdgrV-elJwq%E)kE;5S(DT^leb=ph#NWBnF6 ztioOffnOg>g5$*v>JBd3SX{6)wS2m#HNV@B$vkwRBGPQ;Lqf>VO&8|gj|!zH`fil= zn69vxHF2Jvao3W{-mA{axOaoL=K3k=I)~ogHX0`_g0wim$t`+qs;48UQhS>_JpFMc}ZkBiVkXJwL zLS>r+Sc9zCse{V$x0@q8>@PP5NF0^ux#mG+2h#7Gu^8j(ffYVjsJdR%hNLYNlwKP` zI;7@%+lAxShBtB`WAv}L*HAtUao9Y8JorW%y}nvefdjea?-UOlD(8M%dTIFl}Wa^I5Y z8~7;^4LWZKfqk2#szfb?^bOb~>7Dr& z%}-(yYyn;Y4{fV+EjrQ#neqk-+r2vY^hkO-SS%P;MgE0&ln=0TgBw-mnn6Bs{n^y> z_1pYB`n@fgx3TyLm0)B4HE=pe0Y<`lm9edX_nDZ8Bu>K`ax>m@Tz^gIw&m|~&Bo(k z7%?KYOR~@zEF7PZ5V!TFz7vNhxm%fJ;XO=;A^nW-EDd-NdL%_r(2Y69A>~ew?m*)9o6=~AMhq~7?7e-J9 zC(RKGpUzR9as|Y*%~it8tA_gXR#}lcB+YMrRW5DfbsXBSsjiVr@KzcyDVHLaIFeaH z#=+=+9tYpN5{iQ6o;s=(zH^j;`0CWd>LBeQZX(=?_O_xW7HdZkd%bR2^tpf+QXl-9 z5B=KcVoXASFWh*RAep1bNYWzZ;!*X=ovZ0FvI*|z$5_~EGYzxm z=lR*4?Zvm3A=O7*?nUn&mKlJ}gWta*ue_hPCeCT<5<9Y-R0FK9d(?m0tba}yE`%@Z zIp5OD7W`!Bw0jiU4E3@^Bg_`ToHhIn+Kv!;U*i-G{rrzoe;<8h)2};buG!}e!IL~? 
z1M<~Vq%zLwnrr4s@1+5`;=}o^@Vqo|3v@b-f&E8-L?;jdauil3n2pvUFkS+^KAXnt zVw@?_(Br%N@>urcuf5pgY9iFvn`HiFMX77oLy_l)^s;@lNCu}gkPb>)!d znd~yDs(m<#=Bdf&@m;Q0puVS`c_SfL)O3^-UtqUz&Ugz^CRepC8M@iB58QAjl<^t9 zO1|N!sInB>`ZE~|{I>2H#^w~QfCVnJiQ{0<)^N&eT+)q6bS4K270nmtV45yWQfFow z)lWh)sEFCzwX?gbNA*sch2skjDXUvJgZ_P1jv=v;I0Xy0I9BA_y$iRl4dnMtN9(Sy z#}~>~AjZXsEouKZM!1%8XgHkWwW-AGS!A7&9cZ`?Nd1MIKZ1!t2{hVS?!A^I^@qk zY@Z(nktpDu8%DlaF&|=daS&E!?PUm$)JmB8pv4YY>x~(41+0DlZmE0wHi_3A#jhFb zMT~p2(gcdW>lf7JH}!c6d@H%vXXAs5jeMx%OQ0BGRAq7&U^v8mH%q%$@`G6v6q5U* zXn6jyqtpYXi!Qw8)s@lvFeqQh^6&Xb8h8rn3WL$~Ugz&!0O*W`+}^y)d4Ac{D-gBh*geqw9l}^VebzPy&PI#`;dzpxwUDMKYJX_@qDL3JBFX<%-mxD2|X~my5{NTm7KB+SeZ(lh1}a9_oBvXQ^H5e z(FUa8BSlSSw>bluO&hQtx#}2SiPFjb{h-1G1c_3yfd@(|G5&s6k&Fz9QWZ~OY6AcG z(rFYSQ7TA&edX&vXcg#?hZzja;@f0@yX61hl0i2F)77ycy2rhj_k?2@oIHao&lHQ< z&rmeFQcCNK^WE>4ivHfGjTn0{MFuWNOkBGuhdE48HG~e=A@Mh6B!6>{>d=A(CaJ9@d%NXFl9*r8h5vZXSleA9G_`Rldzl;N$ELE^d_$Tf>Z z0`>KPCi$_S=L{aSH5**Gx*id|idH0>OY`Z{;Bp0zJ*I3$4-bar(cA{`L(b6X8^ebj z{56O#w$UQ$Zq@y&hYpdh)do+@2Y| zl<&E;p^GkM3{{It8Jcmn1}56WjO+~oXbxojb$h?8Pw(hM+aq00uk4rx?#F9mKD6XemwC*0a*@^Uo|oaMxGjqd`> zN9A#0I`MLQL|0Aq%wh@H2Md~+G!xowf&}>7kZBdd*it&=Z34BwpPX*HrFEh#K&BpT z7QMB<$P~xGc`=gaH8>{sU9egev8c803!DStWG>tJN;n*gg%dUVVLHl&!D0fv_aYYW zgwHvs8ETAv@r~4R`k}onDV7x>Na$qr3m27Q#=8_3)fg##6-?j~CKlD#ieUuy@Gvko zkc)Y=41bbH*}zCcr-3yvge<`^ShB#*WR+3-Q~frtSfbtMWX`7fEHt}pj%~Jl&!?x+ zNFsRy1AQHH1D}Qy97Z*^Gz#M7NNA%p3;XefCet-+bK5LvVI>whKASgk0r;4}gS%+& z`CmW7WI;Uz$Oy}8Zto1Glu~L+;(W$f9fogx(6mvRT(EfF=4p&r0Ik@g2;)welbS2f z-5cZWn1W{>&pip|{LJBScO*-+c_J>d@b#T(mZAuXRR>EuslWxaL6|^a=o#8}`XPjr za0x6rqWdp&{cw;cGmkWlm67z+uO-E-AunG7urELJn^(CbW2=S2%JRw9tHsyKZXzK_ zFump$SixjvGr|lUm5sqmjiq&k-g{H*DEl?RgN_6Vfw({o8x9%*sm_P3N;p`;I6a=X zZ$)HTqAs5V5I}2B{xvK`jTsDp=#q?Qi%Jsb_hTCSi3@9Ke$JYNreB&TrIf)=VdK*t zcN*i6=Tl%Z0W+*IF$b{8iaj6hk@DNto7&;hkEB@jCO*l-`ZQbg)(2Br#BHN*Q|!KY zdH3P@QEJkNFw;2QpYdh1eAgI;W%KV@mL14?mJJJsK|-JxjlpEffeH_cfUQ!Jno0&3 z5kigP`_WM4Ey~%;a)-}3gqf1#ms;7V(6J-^cqL%--~(1hwlV_~I? 
zn)HX1DmL>NM~h-L_qMQ7i7|7XK}f!(|!sK^yX>dPrZ>Ad&nE49JKz z;o9bJvp+SNk2M-r%>4;5Qb|EJK{;*&oX9^Be_KpQ#Gh+9!%pf?#9#e)#9z~0)%4#H zf12MBe+U`v{~Z~&`w6qb7WW)cCbr=bo^M~RCdNWv994!{YS_n`?PB#X)=CSM-yur` z`_{1$5z0lJ@jyp@C8Rc4!A%syt71;mm-byENn{;nt$J6PgU3Qx!?-$`3INYy;=^U4 zLmhN};(>=4vvn0TP7sVa5igspa!b6lOGdCR4};CzMzD#F{q>aQ@~Fub#aDO|oz8CkYQDHWHYE(^g>=Gq%KA0TqIR=Vcu* znJQr`uQA!2B8rs9SB6+#&Rhiamy_W&215iRid}3L9uLAWjIR9mYSVv@MUy8Y#MV>d zcHPqEQ@>(tSF4%gBATKb{YH3;`` z+vaLc>{~PHa!_GJq_n_#?&{*lYcP6w3#L(sm^z1&7(|)nlfO;L-{hNJiht^O{b~_? z(0hRMQx7w!F1o<`kSJ$8f+mtxaC*4~x>Tgh%-1Ft-bI{vKW=wL}mVgh+eWP#bA$DwSN zki>*6w?p;SKO2B-I3zIv(-x5d{*URTK?6xlpl@NBP585q{2!4CR%GG*+`EGJuPt2r zc4rWl37@1`E8U7P>x^f)ls6<+12T;;e(O0$Vsi%tmx-;`S6YUT)9o(hmI5_SX(=+Y zV8J9@htU9AR~@n{VwZ@1&!RDM*DVdcNrn$Mep5Uidwr114;ySLioXERe*@5bY*k|* zvcXZh7q%r7@HPfjaSFsP{QL_5_OE%E*Lq$9-e`&Xz%Mn$b6eoyj%6gm%der|*O$TO zYN_aplcqDG*&Rh1ypl>$|Mww-YpbWY@(b+B;B31|Z$3IkK*%^SmU!-NLXn$=cx!|C zBOL^JoLVx+rE3^J3+jQkQ=F%9lt>0j`n-*AQoIx1(dB5Q{@A8EP^`r%<*CTsi@#`R zFWw%GK|RGxfrhdHK{GKIpLa&8rqzc7>Dc6Ey zOLZKv&Z%Z}0AysuKT426&&O8ic_DD~LB#338Uu{%Bxl{d6~tJ7e^d`UQffejMH!#? 
zbL>^}Ouh*fQMzAVut@k#7u)|e?+pBat4skE8Od7D)))iK>wj-dQKHTQ57x5!RD*Vn zSz|ezZvJlW>V}grRoBHs;n8F3p#8sOp&795-)51RdQf@LT9=k!lcl+!H{l@GMf#xL zWFS|+&ZgG!9vow*csCY>|6j7uj47NzjIt8N?1FAt)5vmg^Fl_dZq6zzLR-h}5G$R% z87^WD^XX(j6UmQF_%U1Uw}GEYMI>pd?87~1PZm)Gi2xWZdHZed*coxyMiSvRw5m)h z+M~$G;Ut|Q&k}iO)eUE92HK5htZq)T94F)q-f&5KEB1Y25ywhno|CINqfuN)L~`g+ zbky5`(A`$?X#&|tdm^#%ZfDYF2RB1~E{8kfDo@c)mqJzaf$NLYSVT^kMgahr`F{Uj z0JjxA;}hu}S;5UA@`bqd(=+q3@}ko;)sSlqT7Gy~`6yCdvk&CnVX_KurT&FE+oHFK zr98zWR>lX;0VNk|L1d%Azthp=d?Mh+MP1kQNdo9x8)7_H`yPvK=cILQ1j?)~!tYk) ztmOq{Aci-mfD#3!krEw!IEEp(yqvzW;$fm(tq^v=C#KG!UR4&En)d5vQ>a#gv7=#v z&pxl*u;141*2o&okRHDVMfIqp7%3(Y8lq)5|AUmX|C7u$ z4)^S)?9r)T3|2?npq@fptL9gfoY%2jrguG)J;Q^Fr6Uz-b34kYk2&?XO8MN3L>`6n zKc#q&p}DZYlU&GyAn}N>HxZ&bmTz*LWxnVRT#d~88WJpS2Aq86=hJ(a_?G93*U)6F zH$tY+@;jAa4PI1x;l|yzA>x*bSJf-JHXU~`?#E2Rnvt|B8L%|XPo{4b$2Q3=PbIB| zS9bia$7HCqKHH7Dt1c|N!xwj9;6KeDV+nk*d!%EiIsjVFxu@sFLiOu%?#Cgu-`hD4bQ0bhRdzB-0|@tYtpE2OE-A*f+!YoY{(z zHHNM((#ij;owMv}>)rY_?(Pzbl;T!eytq5X9RdYfG`P07YjKKGq(E?Y_X36D9<+F% zSkY(ozxRIjIOo;*04F1FGDdXezUNBj@0yqLB#blly?*JrWQCvKV$jqqdljw zJ*UYZ#F(XVtZ-z^XzCE@c%Y*+S1k-u+<|vnS;G3S)ww(Um_2DR(Z!PPS~KGz2_X)+ z>Qbk98JsKOn}J<94?`JkX5q~j`!oZXtMdRJUD6j8s4&AAe$J)dGSu@bXFcw;>g*QF^Qj-M7+Bk1_< zRDn%!ZpjYXrDc;W^xfV3I$+P(K4?0cJ4eYN$;5+L;uf`wY-8p{$FJ$q8NTCoX2E5Y zT^&aH>Xa5sw_zj1iPeMcS%tle)k#v@*cstl6*j0!A6hN4^xn*foy7Da6CJ`r` z1Y?7}R*;3|M>P{5H#h!%BUHB?<$`#h-Fi;#co<7g;_`B2?EK(fIpc8a9MA2y|KNJf ztK7n*2JwI!A$N`{F2S_NhuEXzUt>V5E;q?hKGBlZvM+Z<%QPbMTd8F^j2^!8JvIS( zZ@LQrKRXt$)$DguMV(!bI979y+kG!HOn=2vK$@;VNt#~5y$%}ob7bGUaIi8n&2rjr z-PFTA+vQ}JhP7wcq4(w|oiZ}piO-7Fq9@XVeWu51?MKartDn!~N&zuPOLc z7V0a@KcoPhE6$;X$fXS`d}=Y+3M`~#Y=T`X(fslG{eiTgEZ@z-?@rt}dVhhwzX;vH zKL}m5DpyhlE`~SpskjmL{7X*p6GFJ3skepvx_sNMcyJ64_mAi1WeqLJrcT65emmCW zVUM>ed*TEGi%YaUaxz>4^uj{8&!6EiaDKOc_TdiMzt~Tygk}{vM_$vbPVLYGr2y;e z6VP&Ry%i1a!*dzi{5T#FA zMpb7Ct5=s|9SFMRF({br9-bj|`xq%0-$_&WUThipzvMkf#)vZX)^WCyBkwAK7kXZO z>pLV<0c0IC z(w_n1C?8&pm^sD)etBMP_mb^g`UVFln+7KK%Xq1&G1W1p;RQ%h+12;WQZ?=R$Z|Yf 
zclT=4og3O7P0n18wIY=krX4tluv6s(XHQ3x#+r0nLJZwq>Tu8yv|u2!XfH8F09eR= zhb_L7gN_QDlTSri(M)2hp?XQRr8t(xy^=QVhU}o>53sd1w$J-Vk)eyL zqih{v%8ciIMKwc-dfg8@Ej@M<7jmX6M>y1` zP$bfJ;gR6#DbH?3o@u_)k?{DlSNa(b4ULkqre+FiuEI6!QSE&wzW77=p&l^RBCO*? z_x9$+Hf`&dhLqnSsHB!d4K{b9JLhC4(hQ7ny&ozVVb+>l2-Ml@5aS>! z;V%SLVK21t&^}Hq=kpEvX?W*L!xcAG`0A&^*yD32ZwLv`PC^iZo|ha=3^6aC*n0`E zFtg;~*T-^SDJjEPDkcWCGK|R3O(@H;;KqIM;FoUy!t)Si{|`J*Sur@4IITHF*DcPf z!AoJMHr??3>}LK?#vP`k4q4I z>jYdFUFz*Y6U#v(Eb)YS9o&wOyX8qw&6fwQ3*jktU*7uddz^?(h_`HTqpW%DV1nE! zcli;Jh4hy5wu6W@EGCLS#+ql}=V$;Q{o_!DK;wFr_vKoZvF#TT;1KMFYGZtm>6N5y zC&QZ6^)FRdJJ5Otu+#GEia;>sMPzO&XMQmp>ChE!?0EX>On1WZdn3?y?|z~M)A#j9 zLBWMC>^9`$XF!n#erixnS(LV9SSj_ey1A`{BKvD$t8Y1l+!5S|rjRCKL#oADc7f*2 z9?Z$gfzT~)<=jj8Rc4oy-Je{qyw|K4yJcX$W9oaV@3LOQV0(&rA5J+&K?jVfZ2SAG z!O38vE3fcagtq;IcaFD_I$D+M!J=J{%r+} zMXLvGI>x)l8(A*~b4zUJWAxA2=}yg_g(wZUWF+4o+0`1UDfn&P-$+pif{SRW8*F7d zVk>ahT+@lHw$BfiTplm9z_L`EXl)mRezc%26aQzh#To5KD_JaG>{Rg-@WqAw(poeO zB?Ni57@XJ-M~G*pus|zp<|V$Yt#cWIBe)<~TfXW2%Nu-n-`H8%m=BS7HP?QdZZ2#7 zU5Q+GrGekmt*KlOOg+d5WGH0{LxJTTNH#1a0jlDXnK272N9)A{W89Itj+TUYKAW?f zk<+i3`7U0*jn$hu+fKp8YcUyJ7!K<{;xJ7}|L$6Y*{_L^zQGeV*biHI;KzM}u8SUp zmdc=n#T6vn%u#cb=~!3i{IQCJRNHBb9zBPJEyZ<>)-1qppGfhOb*%hf(HFQ*z-^X{g5{hP-dPihs(D(8_}L-LsNtoYry*05lmu`*Y}o%zF`&DKFZ ziY*BCwXJ-CGH_cpIeJFxvnaNo6&y!kfn7)0@ClD5+^2i{i#GZ8moOy%h<&GtAwN&> z>-x3uWxu{ql4W`Whu{jq`sHBEQ!pHT}sLoRU~XTp8L;86K{(rBLj ztzk32yub|I!hiwVBMa^n3gxm|8fb^PNRLU(BY?u!IOOwIJLVAWcN~G|pEZ8IfpXUg zdQ8zzItbX*^7XD#OAZkr9&69(pz$O=$A3;H{NB-9iaq|Vh-24?dctYt zH`WEW%hJ8JXCXFzkKlCl@R0GoEJuMS8~Trx>vzc5+J9iEZgzM|O)7jn*yVB|cN=P* z@qLNjFMg+)269U!!Cj9qj9&AGVj;!o2c;Q$k)?I_rZ9?~n!7uo+E;j_I;QS6Qhe@a zh-$kZw9tEWeMW`H|)Q1%j=p`xFoW)UBDzdua0LU_)DXa#)^#zB7zFa_cx@b>3Y`@K{dk~FUZha%ovX1}kXMiS> z5v%U=j)jN*#rj)y?L4^^&a%@#H;C@*d3pu{x}S_k_1C#?W&xx)YJ4SRUzX#sIpOiS zuUlwT=U8>WF;!4EKI_&<`?4>3l3p7%JDFyX@B( z;paWz(~T_}WmggPwqaa7CVCvGJgBcf=yo!Q>TY0O=L7N`7bHQxF~OO4gM>qS^wmKYs-VNV%R%KE zT>|CTd9#~8(V)Njks-%xx)efr5Q*A6Ee1wir#EwsAKl(R1P5Xg`cVeDZCF3mDmpvj 
ze2ioI81Tm>{gP)3qnt)8Hn@cpt?&HHtv4|jBZ_E!Lf6;%)V|mk&yR{4{Tq;q95(H} zF8@I$ndNxA+C5iuvJI%B4K&|>&}utrGje;jM6KPLRokrAsV;$4LTb`)BcecQi1fIl z$LP?0e$%XLS3^capAR$mvPeJv#6K7F1?|Y*J0RJNkiuNxznElt)j$~WD5_veE6~+o z+9=5zbUv-v-NmGs0RzOC0i@ilj#o-_lPJlmC^5Q#eddaDm<*08M zaasZYgUjo z_TKubn`$^;>z|Jc7}LhEZ_HtxCAz0n6H$my!qU4{6NW#Fm~%Jd)<2; z;kL9P7MGu7$&LnxVdTp4;}ot?)t)FGc31Ro#tTs-b!E_#B4Z+S_S5My&_{iks|-g( zPYH_s<|wMhMDpoRZ7=SR?V0Ryz_@IfNm_J4t~8n*ahnhML+UXMw^%H_mQ*BDj56@p z$~s31Yji103PF;&Pe3w)UV4?T2`}7@t2`mJNi4MJ0Dm*gs z^I++5&+1mq2a#uup!5}IkxrJL7D;9;J#UABJJI&MSN_B^XUqE2jPJ*f(ZYn0)I#kvpy?o378O;ixZCWC%oyw-IFu5c=OK1 zPb03%;Yrcp@M!?8$UTsXfxr|PTBR6!2d|}tVm;D8!zie+$qtrIpKj-?*4vZXuH!SS zU?K{ffy+|!7X>Kvkl)=DJR72P z@pa_aHROAoCIP-f)ues@v3DWC0Ug(mSm@2{d(0c!*!&zr&$a_6R=>WHm)H;E4OESG zo~CGA8kkr~zy>Ccg7^uDYcPyR%;#DC)cs$$VlPuy%aS(kEPS+aG&NvQ!)ViUv2?l@ z5j-OC-Og5jnp*DUYdjm~=q$^kYrO65)GyKjOglA#sXHThUG$zHzTT7X6|Q>3q{)s& z4DGGXKY3DFVLEd{U;N67)J(%0jA4~iUI*BtT35^*=Z#LmTiWn(TlCx^p3*2z(?OYv zeps9$Dsznk`4z`tDz*JGxH_=!Bg9WrY3hR?L3A*6KIGybP84rnBBT3+a41nM?wq1D zAurP*BG(cfMC8WmitKUeRs1`5fgQ&eUzi^>RsbH-z+;kxNqD7XDoW655#dQ)n#Pjg z)dxy_Ix3=b2$o*z1Z+gc)}rNRZ`3~ahbagSysdvb@pFyRAbPp1Wg2;e>Ri~u_Ea28$#(mky$sRz->U=)wEah;rK9+ z+hi90!rj#EVRuFNsgMbTQ8l4jswe{#1pseWCB=j54674wehDpIpTQ9Mv%$h`?-bq_ zLW1Yg7gr$YwdryD%!3?QI+IM)yK}U7_R(XeQjm=Z?;~)y!yAY$!LC)FX#dXfP|2V8 z`UgeWJ3$#3o|C0sufS0QX5Y~JV{InAj zOOa*{a;4w0Ihv2o$BC2bTrjuNQAkh?5fO@6#lzU+{Y^aolaq~QbyUvFeXAJEq1B6> z{dI&{i=^6jQ69DphSmnTKGrO%sZJ$mc%Qm=(TCn7k%9whsQdPs5AA)=c z-2=Npmz1B3cZ}D!@icaIkM7+7479qTk!1Ck5tKTA=-zX!XHWe^R2^h8jkzq=qzq5E zcVWXkh5fXX!%4s%IWaBJehUi*FZ;x=JC{F(NI@^&WIl54>Nsn@EVif#We<{U5=^OO zDmlW#Q*W$;NK-IIci*FsUTyRgBI8X`6WohqXYi@8Qe8#ep(RB@=-RIx*hG=i%OF2W zu5BwI0bOE$SOsXw-o2SVL8oBOMcKiCGUb<9_~8YX0BQDObN?!ZEb z;BaFmxA@{&8qji7FRC1bVlf$4`_VK#Dw!-pTHTzD*ARD52m?ffaD1a5ybp2Hjb1(v zPGPugdi15E^mDtz#8ztJ{r9;dBLdwrU$owQ z&mS>ep}{Ev?4WAP>#NMFH?wC=Cy$faDi1hyZ11(8i@obVOU-Ts8_O%%`!K)Z_QJ$5 z84%P3XC#9IAryoyGbo|Ruvxg!JZm~8HhQ+1!?k$3D)Y=!WZ^CfhsvmhE-60_#hb-R 
zv&D2Q^p+ZILP&v*bamoW;Y3^a$UB87(Qu@44d;96*F=1n=LNj_(NV16lQ-m#oGFew z#E#l%uL781qy)TA+I4r&IIRmi(){bw1Ae(FD2c-EHXQ^v@$kJKR9fhI19R!RgoQEC zE1%bdH=Sv{Dm0>nb(hfdwTHd?#smyA8othY%VvJM!#vDghu4utYog!8MTUo{-2JRk z+vxY{V!Zw4N_4L1aO~x_TD>h4L0`wvNNZqifNf@J$v;UagHD_#U9OqO&;P3jz+z;^ z!-1Ymr^H5^PO1_RXz4SIYYDok?E<`#_qW_{CKET7i?MZw7 z_sJ3e!W{@QNjSzQK-QH zM2YO%I_s}TDx-ZO(N)WnBdc|r>v2_jC5UT zrbWeCz%`X$_l|dJYPIQ%0045N6q|UgdFwRhADQOd(ZeRj6KUpfBQ|Oxs{gWGiI9z- z1S_gu?h>{UP1dR=JMf4X!X#>6-u`18v3lOl&>)MMly2JO4@gx$PYd|f0*var8$ar? z&lm!7ax4G|DfL-90}rMXDH~ftPlA`Vbg=cqL%-jGZ-*z<=v!#H1BNYQHWIPv*%HJ5 z&87DD6-kfE{L2yo%M)U!ks>m6$fw-&@@Z6&O`-g=qu92!%eYN5*O;t{f&WwZ9}v}@ zIrN+*IS|fKR|DXac zEY8kAp6u*y0`(nGA^{XEbqjv9QoJk{}wtoO(@(OmV%bv|;9BjA`>gM7LG z=cI3_E#-*cCrpj#?U`nR<<-t(?kIYlD%WC?#NDMvA`t`@Gvt}V-j`{VFS$OdTJ#fNM-xFWl>4>ChnQ7NYwk9?W+`ffB(}us`Ryi@SzHeee2V$4!Gx2Fv5tBiJv+UF)cA@R>!YwlYoQ#+d4#Hn-Qj z@Z5Y{dV(wF0QtM@ib>Z2&OO-P%d(e@zk6`jrUc*2le^<)fbj56kURV>3jLzdq}&sH zll z2nm$GJ0b4cnhSnw2bhM(09C#E<*nnEjScL#%%oGN?-L{SVbYTbyer)QZ6*1A9bnnG zxm_cqFF$u3)g2cS+>2bF1Sv>Cx0hSA(C&XMfB4fMXI5|mm;9hNu@rBMcm9Whz9o{H zT145?2~ecPFMSby^@>uQa0dj~)gEAYX5DCY7eyutKY7HFJ(>Hoqg?2bg#VUz8#j_E z6wLHj{9(}9&3%!4x&FtLt0i>lgeqXr6tKnv_UBUQw?F5rmT<+`$xJbOn%9Z(pS-roe^}?o(-hAifaMdSzF3a4Lg~qn^HO_gMzlGjPpqZ0JE!v66VO8TPB|<3LiC!B1 zho9r89poOVBs!btriN#&>T2pfEq86};u^ZfLZZ7ifYjKaJj}rGq1--ug_!$QlIW9~ zxEf5q>HKxn>+O79_C_`P)q(lNRmahzH3f$$Ps@_=L;Qx)^=}x3IJu@TVauIuDCp%4 z-DWMOTe*kZn_vNFpZTK0q5OpB%nk04My^qAdFTN53AF5+!TrzR$9%3+l{GZl_l*A% zV(Y!TIG!eM;-1$d=o0@F^YPQLx@*pY@M-xJp4!~KOtP!-`DP`YKgX#u=Ip}t-B;?3 zs=&4+L|YjH0>qENSuY=7G|J8v1xg_xty`)?zFvtO#3O7bbck?%EXv_GGP%`$Pb2;H z&BQa6etXnizC3il`(>FGpWTYJ&&M5$n-zX`{yv(p#9UgL6?$UBp`2Eu1f#}>@Ga$^3k9#T(nQLp9o)yYCBc)$9G=+wMIJmOh){;;g*An3N9} zU-AI_u+Hx5x0kXiIv3Q7NL2W3a_+yET;bkUiyJ4wQZ`)A-hshnYjfDJhN@T=e>eI8vaNS8zx zm7UB2g}d5L)UrnBWdpjyS5gaySkZz@1)mQEiVj;-WhpcI_=$6dKkfo zp>#n7WYD~7XaI=4dg+PRLuBCA?g=x)@AtT8`?YV<_vPyjCzh0L9NcSKvBvd{ z<{#tiF5Kh?oWUEF$1$tOAGx-5Bz6SEpFMZ?>|Gp*Pa=N3Ny2E3k0=bf`eA9-K=_MI 
zD!Y{E(=Y#YUevC0!VE$G-PT3rAC+ceHtQ)C!8Q4)h>`5%P8%s4@uCv#sm05_0=ZDW z*Sd$)kjONywK)vgZEI5rGTFI_cI386hV1}y9t*S)^`cH;`$Jd^LvE$1&in!6F#$XQ z%S-*aaX#in3RlsqRAnBLAWF%ePr>rG*Z!VprFmBVIzwt0Ndj$V2Gq~;qEB->*u@X; z?n4SGXw96AI!@-v4QL=SXaiw+lH3(?#M`dCdzHTQ?tw{fefzR111N6($_|??Bp1 zy(pf=mvQnw$d8-!7w2<*tO&6FXlie?Yq-tHut>6L`ob4<+w!y-B6j&yE=^b@h3kyW zs5YPW+&mTVUAF2s)U<}&=4oH-xqEjU$4otMpxj&QshycF#QE?+acxa<2hTeTREQn^ z0`U&>2dp}mUS0zqF*X8Dvb)gfiucibwe)!roU~ooQ;JQ__4jcnp`^k`!Q{6nh?OaG zF{%{$XV<+4CAL`j@-xAFDdV5CE&|W>JP38=<3MIddpptq1C8oIF`?-c%mH=q# z^G#ZT#^9KsetFL(s;0*0LIYF7V5L8SMNwbM203%Lsn`ypf1mpB?=P#X=e~N~v2k4A z?!A70>(kJA)^xP#jEPK0J-q1oZ0kt>S|mh<5t&`^n`B&Q%2*KU=Tz&T-M`Fb40y6D zOuvN`>$~>Y$QI%Xsso``14?Lw{Xd zeyA(hX|lIbS@yw#fjAnv)5a`Uc+BiQKcs@7eguS1$e~ zL(JzqAlMg6gLS>7?3h;2?ks6pOjP*tEZJ;&q6gR$Nr?qz`*q&+j!PP{LQbK>H`)cr_5*{Yu=Bv zDo(cZDu_iK$B3VDrc<^#7`P0^pgKn!Dg+B&#GpsEX@iu~gWEue@$yd^@nidPfr@!F zC@KXt8us>tm&d>C{L_k!IQ6?EYMJ?=40{Nf+&oOZn3oMrP+m^xZZB`(48`R+FP9xo z?4x8S%f?uK=cnun;Z35XO_5xTkcG9{vK|bt>*t((n(;R%{4M5s5k+!r8`OCakpu!Z zlQk`FZ7OP`*A+$|pskjosMs%D)`!b_q`4hzLg1#^q8%Mi%Q3%yNG`2*E8oaQqUQ9* zWTRsy1jq#7q35CyRWm`Pw^O1z<{H|W0_B;0sJDYy9-Bsg51T9|plB-xD6^IEagy2z zAhxslL^OMN+QOg=LB(Yd_@xWn+aHn|(DqK$r0b+tm;OpXfa?;v0T1ucevL zSjBEt42a@@Dvm1OvdTtW^GBVdDkA^-7G{xVcEo=E4F@lb#{}1T<^Uh@1O%Em^i7y* z6LWxH9Pj7Zw?c)E@#|Iybjq)_Rd$r3#wcQfJWzlNO(U7@+Dd|WWK>Erbg|j2__{fH zgJg~p1>S*s6mZNc=d2#}3gU{V$0QNnmr^0Wn60o8$1@wXDg1VYZuCn?4`a+X=gs>p zB~}!;JMO(6|9virUkhlRXkfnMrUSQVA&9et~NwNd2lKek6C6LPKB*LZt{~i8cjsqNja4~_Umq17p0k|m2s>xJJnFsw3 DQr_yr literal 0 HcmV?d00001 diff --git a/docs/examples/op_fuser/op_fuser.rst b/docs/examples/op_fuser/op_fuser.rst new file mode 100644 index 0000000000..9613ba74b3 --- /dev/null +++ b/docs/examples/op_fuser/op_fuser.rst @@ -0,0 +1,353 @@ +.. + Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + + See LICENSE for license information. 
+ +Operation fuser API +=================== + +Motivation +---------- + +Transformer Engine relies heavily on operation fusion to achieve high +performance. A typical training workload involves many memory-bound +operations such as activation functions and normalization, so +replacing them with fused kernels can deliver a significant +performance benefit. This is especially true for low-precision +training (e.g. FP8 and FP4) because it involves extra cast operations. + +Managing these fusions can be challenging because they differ based on +operation types, communication patterns, data types, and GPU +architectures. The most straightforward solution is to provide +monolithic modules like ``Linear``, ``LayerNormLinear``, or +``TransformerLayer``. These conform to the interface of a standard +PyTorch module, but can perform arbitrary fusions internally. These +hand-tuned implementations can achieve maximum performance, but they +tend to be complicated and difficult to modify. + +As an alternative to this "top-down" design, TE exposes a "bottom-up" +operation-based API. The user constructs individual operations and +passes them into a fuser, resulting in the same fused kernels as the +monolithic modules. This approach is more flexible, making it easier +to support new model architectures or to experiment with fusions. + +Basic usage +----------- + +Sequential operations +^^^^^^^^^^^^^^^^^^^^^ + +At the most basic level, the operation fuser API involves two classes +in the ``transformer_engine.pytorch.ops`` submodule: + +- ``FusibleOperation``: An abstract base class for tensor operations. + Examples include ``Linear``, ``LayerNorm``, and ``AllReduce``. It is + a subclass of ``torch.nn.Module``, so it can hold trainable + parameters and can be called to perform the operation's forward + pass. +- ``Sequential``: A container of modules in sequential order. Its + interface is very similar to ``torch.nn.Sequential``. 
If it contains + any ``FusibleOperation`` s, then it may attempt to fuse them in the + forward and backward passes. + +Thus, using the operation fuser simply involves constructing +``FusibleOperation`` s and passing them into a ``Sequential``. + +.. code-block:: python + + import torch + import transformer_engine.pytorch as te + + # Options + hidden_size = 4096 + ffn_size = 28672 + batch_size = 16384 + + # Construct operations and fuse + mlp = te.ops.Sequential( + te.ops.LayerNorm(hidden_size), + te.ops.Linear(hidden_size, ffn_size), + te.ops.SwiGLU(), + te.ops.Linear(ffn_size // 2, hidden_size), + ) + + # Forward pass + x = torch.randn(batch_size, hidden_size, device="cuda") + y = mlp(x) + +.. figure:: ./layernorm_mlp.png + :align: center + + Operations that match ``LayerNormMLP`` module. Note that different + fusions have been applied in the forward and backward passes. + +Quantization +^^^^^^^^^^^^ + +The operation fuser respects TE's APIs for low-precision ("quantized") +data formats like FP8 and FP4. Constructing operations within a +``quantized_model_init`` context will enable quantized weights and +performing the forward pass within an ``autocast`` context will enable +quantized compute. + +.. code-block:: python + + import torch + import transformer_engine.pytorch as te + + # Construct layer with quantized weights + with te.quantized_model_init(): + fc1 = te.ops.Sequential( + te.ops.LayerNorm(4096), + te.ops.Linear(4096, 28672), + ) + + # Forward pass within autocast context + x = torch.randn(16384, 4096, device="cuda") + with te.autocast(): + y = fc1(x) + + # Backward pass outside of autocast context + y.sum().backward() + +Branching operations +^^^^^^^^^^^^^^^^^^^^ + +The operation fuser supports very limited branching behavior. While +the operations must be in sequential order, some operations can accept +extra inputs or produce extra outputs. 
For example, ``AddExtraInput`` +will add an extra input tensor to the intermediate tensor and +``MakeExtraOutput`` will return the intermediate tensor as an extra +output. When calling a ``Sequential`` that contains any of these +branching operations, the extra inputs should be passed in as +arguments and the extra outputs will be returned. + +.. code-block:: python + + import torch + import transformer_engine.pytorch as te + + # Construct MLP with residual connection + fc1 = te.ops.Sequential( + te.ops.LayerNorm(4096), + te.ops.MakeExtraOutput(), # Output residual + te.ops.Linear(4096, 28672), + te.ops.SwiGLU(), + ) + fc2 = te.ops.Sequential( + te.ops.Linear(14336, 4096), + te.ops.AddExtraInput(), # Add residual + ) + + # Forward pass + x = torch.randn(16384, 4096, device="cuda") + y, residual = fc1(x) + y = fc2(y, residual) + +.. figure:: ./residual_layernorm_mlp.png + :align: center + + Operations for an MLP block with a residual connection. Note that + the block has been split into two sections, each with one branching + operation. + +Developer guide +--------------- + +Infrastructure +^^^^^^^^^^^^^^ + +In addition to ``FusibleOperation`` and ``Sequential``, the fuser +infrastructure relies on the following classes: + +- ``BasicOperation``: The most basic type of ``FusibleOperation``. + Examples include ``BasicLinear``, ``Bias``, and ``ReLU``. It holds + parameters and state, and it implements both a forward and backward + pass. The ``op_forward`` and ``op_backward`` functions have an + interface reminiscent of ``torch.autograd.Function``, e.g. they + accept a context object that caches state from the forward pass to + the backward pass. +- ``FusedOperation``: A ``FusibleOperation`` that can replace one or + more ``BasicOperation`` s. Examples include + ``ForwardLinearBiasActivation`` and ``BackwardActivationBias``. 
Its + forward and backward passes (the ``fuser_forward`` and + ``fuser_backward`` functions) must produce results equivalent to those of its + corresponding ``BasicOperation`` s. This also means that the + ``FusedOperation`` is stateless since it can access parameters and + state from the ``BasicOperation`` s. Note that different fusions may + be applied in the forward and backward pass, so a ``FusedOperation`` + may be missing its forward and/or backward implementation. +- ``OperationFuser``: This is the class that manages the operation + fusions. It launches the forward and backward passes within a + ``torch.autograd.Function``. It can also replace operations with + equivalent ``FusedOperation`` s. + +The first time that a ``Sequential`` is called, it will group adjacent +``FusibleOperation`` s together into ``OperationFuser`` s. The first +time an ``OperationFuser`` is called, it will attempt to fuse +operations for the forward pass and backward pass. Subsequent calls +will reuse the same state unless it has been invalidated, e.g. by +changing the quantization recipe. + +Quantization +^^^^^^^^^^^^ + +Each operation that supports quantized compute holds one or more +``Quantizer`` s, which are builder classes for converting +high-precision tensors (e.g. in FP32 or BF16) to quantized tensors. In +order to enable fused quantization kernels, operations can access the +quantizers of neighboring operations and quantize eagerly. + +.. figure:: ./fp8_layernorm_linear.png + :align: center + + Operations that match ``LayerNormLinear`` module with FP8 + quantization. + +In some situations, like when operations are split across multiple +``Sequential`` s, it may be helpful to encourage the fuser by manually +adding ``Quantize`` operations. + +..
code-block:: python + + import torch + import transformer_engine.pytorch as te + + # Construct layer with quantized weights + with te.quantized_model_init(): + norm = te.ops.Sequential( + te.ops.LayerNorm(4096), + te.ops.Quantize(), + ) + fc1 = te.ops.Sequential( + te.ops.Linear(4096, 28672), + ) + + # Forward pass + x = torch.randn(16384, 4096, device="cuda") + with te.autocast(): + y = norm(x) # y is a QuantizedTensor + z = fc1(y) + +.. warning:: + + This is an expert technique. Quantizer configurations can be quite + complicated, so the ``Quantize`` operation's quantizers may be + suboptimal. + +Implementing new operations +--------------------------- + +Implementing a basic operation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Subclasses of ``BasicOperation`` must implement ``op_forward`` and +``op_backward``, which are reminiscent of the ``forward`` and +``backward`` methods of ``torch.autograd.Function``. They have an +argument for a context object that can be used to cache state from the +forward pass for use in the backward pass. + +.. 
code-block:: python + + import torch + import transformer_engine.pytorch as te + from collections.abc import Iterable + from typing import Optional + + class LearnableScale(te.ops.BasicOperation): + + def __init__(self) -> None: + super().__init__() + scale = torch.ones((), dtype=torch.float32, device="cuda") + self.register_parameter("scale", torch.nn.Parameter(scale)) + + def op_forward(self, ctx, input_: torch.Tensor, **unused) -> torch.Tensor: + out = self.scale * input_ + ctx.save_for_backward(self.scale, input_) + return out + + def op_backward( + self, + ctx, + grad_output: torch.Tensor, + ) -> tuple[torch.Tensor, Iterable[Optional[torch.Tensor]]]: + scale, input_ = ctx.saved_tensors + grad_scale = torch.inner(input_.reshape(-1), grad_output.reshape(-1)).reshape(()) + grad_input = scale * grad_output + return ( + grad_input, # Input gradient + (grad_scale,), # Param gradients + ) + +Implementing a fused operation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Subclasses of ``FusedOperation`` should declare their corresponding +``BasicOperation`` s in the constructor. They should also implement +``fuser_forward`` and ``fuser_backward``, depending on usage. These +functions are similar to ``op_forward`` and ``op_backward`` from +``BasicOperation``, but some arguments and returns are lists. For +example, instead of taking a single context object, they take a list +of context objects for all the corresponding ``BasicOperation`` s. + +..
code-block:: python + + import torch + import transformer_engine.pytorch as te + from typing import Optional + + class ForwardAxpy(te.ops.FusedOperation): + + def __init__(self, scale: te.ops.ConstantScale, add: te.ops.AddExtraInput) -> None: + super().__init__((scale, add)) # Equivalent basic ops + + def fuser_forward( + self, + basic_op_ctxs: list, + input_: torch.Tensor, + basic_op_extra_inputs: list[tuple[torch.Tensor, ...]], + **unused, + ) -> tuple[torch.Tensor, list[tuple[torch.Tensor, ...]]]: + scale_op, add_op = self.basic_ops + extra_input = basic_op_extra_inputs[1][0] # Extra input to add op + out = scale_op.scale * input_ + extra_input + scale_ctx, add_ctx = basic_op_ctxs # No state needed for backward + return ( + out, # Output + [(), ()], # Extra outputs for each basic op + ) + +.. warning:: + + Remember the contract that the fused operation must produce outputs + that are interchangeable with the corresponding basic operation + outputs. + +In order to make these fused operations useful, they should be +registered with the operation fuser. To do this, first implement a +fusion function that can replace operations with the fused operation, +and then register it with the ``register_forward_fusion`` or +``register_backward_fusion`` functions. + +.. 
code-block:: python + + def fuse_axpy_ops( + ops: list[te.ops.FusibleOperation], + **unused, + ) -> list[te.ops.FusibleOperation]: + """Sliding window scan to perform ForwardAxpy fusion""" + out = [] + window, ops = ops[:2], ops[2:] + while len(window) == 2: + if ( + isinstance(window[0], te.ops.ConstantScale) + and isinstance(window[1], te.ops.AddExtraInput) + ): + window = [ForwardAxpy(window[0], window[1])] + else: + out.append(window[0]) + window = window[1:] + window, ops = window + ops[:1], ops[1:] + out.extend(window + ops) + return out + + # Register fusion with operation fuser + te.ops.register_forward_fusion(fuse_axpy_ops) diff --git a/docs/examples/op_fuser/residual_layernorm_mlp.png b/docs/examples/op_fuser/residual_layernorm_mlp.png new file mode 100644 index 0000000000000000000000000000000000000000..fa95114a69b9cb7ca9eeabdf92be4800be78550c GIT binary patch literal 15620 zcmeI3c{r5+|L+kYp%ThkDTPqV5)Dd`gpWPDNOnWUHiOAt3YAdyeP^!_x*am-mlm5@tSx4Xs9rs;yOh| zMa8W8MM11Dq*0_Kt)G= zj_Sm57vPtQnv06=cbkgpDK+>1Y->~B_}d3KW~dz%?cY8|!1nm(9PkH>{<}SqLj7My zOriNtZem4kTqoTThQT5RST`%et9KDa; zY(vY&*mBU-j~}UC1Swux4u7Be=7a&V+}R0Pm^e6SpI>KBmoSU$$Bm!{jZlMyr|HFb zCM;S{!-6h8p&71RDBQc-=Y~MfOxABe8$MLh_1F4QeW?&|$xE-W9BmY3?KE=>#Cm5F ztDSth*cpp+&mslZjX*r>jocp;UncPcELM89d+l6|P;%q$uO+M!O}?p;rw%VLGR3_Y z6E?=5>CV?`}q)4=p-r1kP{609ua`iW5IEU)c`fFBX(R@7#mysq+P$w>2nV+#3aEs(1zMNRkaTmZvB&bGgE=?_s zY${hRHN|cVaf2RTU<8F?m$cq+n(6rex_9p@aqRspXs>4h>E>fr=3LW#M>Fh8BvZra z?x#~*dR{+h9cyH3!pT8sS9o9zEx1Xd`0&dBy!FDk*cLrcsgO zr&^Z(6P1WwR-CY9v@ZAm9i=GBabEiG9Q^5=%&U&^fpgWPW~ke~gU!S$f@m>*-AFG4 zD=a>5mS1z1@5UJ+f2;jv^$+Cw6a6*qcjg@D3UDr!%d7J&({6Ozx% zg@LY0a;+68XvRH<9}ar;_R}ALV!W(bmCc{pBa;T&f>mQS(Mfn9X6ZvNuC2Cjj1#c_w*OB(LRpI9r*5%P7~L29%n z^Y3gl`|{UTOoa)Ggc#P3udpT}F{s7s^FhN(ng)?P;|~n+6)$I)1+2inOktq%zYkqf<<@bwZVw-Tb^+j z;>K7iYvkggDe@nqO1GvigJko=(!7xqqBp&V4(Z)EK#om<1|c4+P8o)-yiso6Mh`vo z)Q8dX3J#dn;qdb;JbfxV2OBf;M^Q-w)g{m)PC0X^I|fYDbCieuQrMIb?Dj!Ljy_40 zRJb>Vn+g++6qUz|o@oqN%iFs=@H)@rzTnoU_nS&Z<9x#9wi7NVAyEaFKaeXHGJXz~ 
z2^vd@e!PI)STIvCHNFd;EyC(b3yFb*%G2!xE`RZ~e89jSnA>OaP2hI9l_SK`ZkA{v z#pUx&3A=x21b1K8d#0w$f)E!rva=em`bM7@mbG1bICF?PqC~L;4krfSz%lwO_Z?L! zVi{K<%szI;i&K)-_?!YEW)nN`z1LnphFfgSKWI2k^Pz}alp-5&z)yT7DET8G~DqVvhs6r32D|tThM()@g5$)OHn$5hc^y)R7 zZV5GrE6!Pe7qySe^LQ%ax&4HfGB+~XJZndZw(6Pb3>0zHXZOHvfG$c?Uec6@7Aj>9W^n=<%4zSc{`hA8fP#M{ zk4f%u1_*q+R73EnkDGT$P2n_uS$aC__PQJZ8?4?zk0iVO3|RQN41N2 zQw-obTYZZRKDRI3`fg>}-4v|pTe9;4^Yp_C#0SqU`AB+&{5(_{ExseyBjB`LT#Fdj zFUbKnXQY+vgn|}UIh@45ZibBUhyQpS90?vHrP8Rh1o54?qp4suk&M*-jMENDmRzVS zU^yY|KorOCZ8C}Y6|EQQsNlC%g}+-66u9_vE_>$E35R&Ey2FM*?im_MTt)33_fakc zC9WLKNH1jrXQuLlWalqWHWkbdo(H!;&}E+4t}vDY_YjD-WAlbL8fRWfSY?fL3BZ+_ zSE!dvc>Di?h6sGrnU?^* zvd7%K2RdR^AsxIk(;m6dH(Oxkh8Wa?&ClEy6~}FiwZ!x5PJRx1*hz18>p&~`916yH z8vbmaYxZ<)VUpZ-^2TkUi>j449hHoqo_jUx8+z4qIiG4is4}7CdDwX25Ja-BE+=AW z=c)7l?x$hSitw`4fUc2-XTAh=4KNt0*`}5}RysY@F+*K-D3jJ=4R!F`N01uRVU%a2 z%2FL=o_I2Mu|q8lo_B5LfY^CHlesfusa8kJN|)$)7oA5Etv#|9oF_OpegeV2#L8Pg z?22ZCnP+m>+UuROsBA6h*>hO1*mft#aF5HjiiOm%bEUinuII?jMy?h2g`-@K@ zp@(b$Ogwwzy~ON11EULA!YS0Ve7q;?XY6gB*sZO#$fW2|yRmo&FgEUmDY_6|)u>-D z>pAqp$n7B>&lgK3w8foJAs(c@68EXI%`|Q@G~J`jyhTNJR~IY!WQiT#?MpaG@L5Ze z(w;UEkreQ?KkRT8QtF>7?dIFGK!2EKuY=1FcNl9Cy6zN@1N$*c-nf0Gd&o=vnyPDdtL8T<^e53T8Sc+JLhg4~cxW!!B5rbs98~q9Es9QFJX#qIadO0O4Mdm*x%QQWb`vvWX#7C9ROu+y6b-txHS=}h`L#D@ zypbJGR^47kwm0uonZV9+MnPJC&CC6CAKX_{uFa_+P792$CJbnbC*GM!-3pOU3uf6c zoM(k?Po34g-eP@&L0P3G+F*^4B)e2Cki3>KU=aUn5mNTNi^nTgqI0z*fSEi;h%?LH z^f%1r#gkCb{QkR(uD(TDrdqKEYb#%Sw#ul{njci<;6vz8PY_s_*tGOj3bOx3VTpGB z3u6LTuUg?I<|<+||3ij+8)9F}Uon?}#9(oU9^&fj%VTKa?CDRr8^ej`U<8aNN?u~R z@L6ID-pI4s?Yanht2L|(j5wm;_#=GRqu5sWCnjiB6A{SAU{Ty*LbWyG9{SDcTDNthK!;Op=JK`6WZ^7`8Ty3A&OL`Liqqtbr}ROuQz1KNz&vwhlpj32 z50g~l&UibwW$w@@Dm~w$+uf-DeoyCCl~4IfF|a7ucfRS+*$xWjIA^K-Gv~zlO=W=$ z%M0`4;-r)uABuv4oT-7yJTvTh@pFUoabj&t9}zBdKg|=B?z&~&>97fotW+|}*2iE? z9kZkJTf~n#gD{+@Vax5xdyy@4S7cp+UJ%HWU$Kb$bEQ_$!)--OQbxsT$xOzXFF|4_ zBs6(PW{u64y&kN=p>BPdlhP*-w2snPU#(NRIItYGY!! 
zTSRsoO^2G}eJzly%!^@$sDv^zO^lFi4Qf(6y%gOqR2&-+Tco?6!OuPY9`kcCG&GCk zt|BN|mwztF*&i%)2!Sn1s1tKmfe2!|xg0s8V@k2@5+bFf;uDp%nVw$oSI*1#G~|6$ zIEI8{2o5&35EfXUjVw;KuYCPDkdso7>wyh;kCn>d5Z>%7d*`3w&G^nSIOLKy)#8!$!*BKuOn-N2^uO+ zB8CO9w^iKFb&Qc2>lY1F@3a{WGRxb97oD88l9fR-IDZS_@L8GOdc?d^cYRO=GVxU& z8N1k`#{*Hvk_h)_!vl=Lu5RL>qwT++NTCkB0Iz~s{60IqhYctJn(5k_myVW?= zc-{28j$8428nyt=X()k zofUa<;Z>zX+SaA#g=l4+v98*4O7&)S2~K+l*13C~ z>x<-4^Ml*_6qSYu#gM$p$9=q9BqpZ>j+-LZgpVl>>BeV)UO19CgJdg!96_r6wg z>Pno%qh0s5b*o2=Kb3XCG;&t5Q#&)Cl1{o9xpaMivi5g2Hkj6>dXvBG7FEd)!m?N0 z)(~TbT19L@Sz>^b8Gq!-f1^harzV1ykESO+YR4e0HN|RYY*hGs=rP8*0qq;U;H5LM znW1ER`+WCS`CEi9vU*nzS}eM-rgEnT&v&(%mTvzlbaq>C9;iz56x^xQ@85)0c?@@P zmdE-Xiv{`3;xI_>J(>jh$|sS-hldpZHZ!b-Nq+n@z$DUD*vw2e)YPBCZDvZLGSeLo zJPCa+gG)!Qo8R~>(Js4eVdtw{^W~VJG^CQ-Zrk$v4Rhq3Uo+XGe7%XBOaAo%RcD z&po#xc^J(2?OH!UYLHv@yp*o;*{kAKVa^IpBPd0MrPBoc0ji=rA%)A}ILicQu)D@8 zX`nAqQz*BF&%-sWPy~t6=*nwj9nn>pbkI8l==sns#G)Fx9y%{X%;Jn%Z2Y=5*$z6Fmdv(8s1{!u3wZYx| zFAwVq%yhD>)wyLAlz)*fbz`AVnikim)x`Q5JGUfuOxHHzbbIi5c^h!=T3CBcAlM&z zQ?Z+WA)r|_H@F_L7r#0f3H_?u@K|)A!JSg!Gp}}Me5rsRsjZN=fx_Z*rE9p38o0Hg z14lp9S#PY^8Jd&)d_K$>>?{n=`7-f8muVdyOezT&9Ut`1-%!C+SR0Yvm0t437~6C# zfV{RXb+8l;AG8Y=bl>oOnOj9&I5$17UlLu+*2Qm2G=TWPw8v} zWWndvxDbx*tenQKn`K$sI@8sQZr{7eCQ5n%<_cRG2-ZiplKC0ytfL3mJe>CgyRi>8 ztAG?aKz==~`@3xs#2{r0XSZFK;hm1SYo`)?SjK$SCl;fECQm;$ zjf~M-aebX+RD}1h@5GoBIlz>|(6Z+2VZr%DQ)zpGfNtokz`8?ImfYE@Xc}+sd+!Ge zImWuBx@uQrc7ByQ`&4jVH$uFdq?l6}ztvl#!b($GDz_Jj)D5lY?N!N2%^ z8E12cQf|XW0vbR6Fj3kJMYFWCSCl-qk>T>b^n-U>JuAF^T)J-g(Dd>3X9r%(_dx|^ zjxAg6g5Lt)adm~+i3!XT&Yw8i} z0H2MRSPC)A`&4Rc8$KdKssiQ0KSTOM+Gxe9^%CenBL(NZk0y;m0kZ^2iHgWrvn0p zl!uE(M?BwUn65q&rM_v8zVhILS-uLjnU^{pWHtO_RG?jV?ep;{|7k#lH!CAPN@lav z$)3Yay(>D2Zyj3r2`F|G?V}=$2#_)Iw0KPx%_q0lw^k}S+hGze+v>>!F-46?@#~MC z1%(ltX0kMGt!kq7O$b|$fBpl=(}w5FyuxWAY3~!F0=2q9Mlx5r8ZQqaGgTv=jK%_S z3n1_FH9mOPAw0slqBXP4Y?j|bW9Aje`L--dX&pqeRYja+QGflY%Z+uC2euMSyq3Tb zSL^Hud5HRwszb&NE{mdk0!Q9&XPB&LwS%itMH@#yJ}WHp7E4b{`amN&AZfSba_pcj8Kt|EylPmMt&yCkQt;fv>DIx;{AHa8x9|KuRzqP)R*L$ 
zYc4^zbQ6S&FJm(-<<9oZ%iaGX1^K52W9(IU$k)IGPI-gUVDSfmx!cWrZOfb%N;&`9 z7~8S5f3Ig*@Q;NS3e}574-SCAqeN3lbgF5RJQ7=$TgwHF-uXSijMF0I2SMfI;UANo zpr)n?rDr*3`sg=Ysz0tnK=41Ir|5~Bbd&})cDaZD*QPY-%qS&qRMu$aM`QbIyJ@%U;GpKYAfK+?%MshcJF}3e|lZ4;PLN? zGVz_}-XdcgkEzbo_L8f&UALL6qGmK-`Bjk9c<-f~~=rw@T3E7ng z{^fKW0H>P{>@NCyvD(Lw@AE4;1}~kIL315_%+S%pD}5enwL4_=!I9%*Bm5GHetciB zv*P6VtDmRQLVR4z_>UZCY~yJ#NcQ4hMsAYdt?CJs1F2)HO(W^6G)PL;RXf=s@leGP z1)y2m31l0$(X>2V=_nu9Gm_3h!_2f>6o>D5-=9s4lR_FOAxi3I>4fvt0OrwU>Zn(3gh z1YiGSZ;llPe+_4oX{61NrEyYM{7r7-hedvKTMzo*%TIpI3n`5W2DXn&3}r&pzIDf# zX}mT{`^du9e2}v?TN8*qa%Zvi4j3K)EKW)TeShC2Iht8a69zf!WCLeec_(kpc3Jav zy@R`L1Mpc6ni+W}x%+5noK!9e74?q=EdJ>wc(2U-({~*JO`lpfnq@}2(x~i)l>Mg~ zOr7D3OdQpe_1}6h2J<)dXX`GD{xO6%ifq916nx|L$9AY`8v!gE1b)Kw$DLm42?9b9 z#!F_(zYUPee;qK~-F3sE6Ki*dHk`J&h9G_Y-gQ-L0M#Q#Qp91X}Tfo0Z zl=}q@E6}E;3F522bi4pxdX|jrHJdIw4gP3#X)~39=KvnOks+N{~JB;IOC+a@<(Q>)> z^(z;{qTcRpWQ5T9FP2I#CI^SuZK7!tIyMHoLk_SbA@B=dcEZ(^>n67LgdJE4a9cz` zdUG5hY5izI`^|=>b8^Ijy?3|npv`tLeIJC#$3~VQDeZ^6V*kH!)$g8UrFp13Y`^s$ zQFg7|XSFA^1#(p6ce8M;(sDk(Tl~mSZ^fS@1|r8+(Ie$`f^;J-CUDZJ)zvmgY2wyY z+AZL--D=~h%dyQ2mmVC~*Vnt7xwZp9ziA48Bx{I8=c+W(VD-LZ>mN*7A6p)C)2J5a zY^`oL@f6nHwK$|r7WF8KBsS?{_2rS%!HOtC+o@ps| z&DT?x7;MLg+nMeK!DUlP(Lx)hz)Y<{u6l}i*2X2uZf+ka0D4$?`u2OQzPU7jn8?=8 z3~MNM>|Ia!yhh&0!7?I>x;TKpm7WCk+%66+|ICF~ z$Jih$wS+Pp+IsGsJh&(4{?k7Xli;22;o-vZr$jhS5Ir?cxQy-j0y8x3rl?Oy!5uwajq)7cp}q zCNedk{)n#w^Hp&1{qoUofXd3;CQEsDjZ@CS&$}ZVRfnTgy(65n?6-Hw*I$P2^ft{` z1xr>zz^;HEI40@N;hdx`K`H!T1lLR438fUb4zd`F>_$&2(xKVYKhv@Zh@BGXQKtPM z(IS2FQ1$X9(aYrC$MbeaI&4*oX)diq8%UPC=n?4@PZZe%i`e?&O?buf>pn6qOShk0 zpgIwOs;~IUqQFCjhVB+T4%S@o6%$QW@vZY^kKg&M9c;Fgkbp&RF(x}w4755z7;E8` zMGLs-Y8zkhfXK#e-k24yI)}E8QTv_Gkq-J4maEMky14BOd49(#Fi=LYgX(K^Fq$(- zJ|BA+B{077O)9IV%Ih!-_IXOeMAbCiCY!}z;0ZtrW8L;bt5UU}u{0UL*oxKb>BZnV&!nXwi^oA*q4H@iS=@SE~63n}Rh}87H!O+I!?o?~g&_K{o#J$9bu9dr!9sZt`H|JR>*+%-7`ilmyQ)=PWlb0&8wg7k5KwM%;cmumA|uZksN0mK6QsUPez#F;XLwXqpHXuJjY&&g zt*bpGRb{?@6xj}$>+DX*WBPgLFzyKQ+0J+?;jd`mbO4^a?-FVqbI 
z=t7}#=A5Zbb?|6w!1%88nsG|?>uI;5p4!i1yeAdDPYk^U0&~TvG zqNVAo$J@0jpYiG%DE=}i+4(KQBRAHn&QxznQ3nYn{@y;E5{@Hx2P@cOEn_*W2ScX3 z%lx`fLC?8V)E28F(3QV$X6p9Q)I9F2mNNE^CRtc2^D3m5OP#z3vv8nqAKp7f-KL?! zQr>nt6l83qeCajahrSTO3y^dugxEk&7+moM`>e1-&)-J%@2XZ}?Zf#PR+3%Q_)>?w z7T8Mt;l=3|-SJSET-_Qt%2KYW>8j1Od4;-}hkCu?D~$IHWMv7&I>JpJBCH_^C7e$p zTs#YsOjF?~`AmPma$7FWB)6Y>o~7%Bv2Ap;s9Q|rdsVJc3M{Sw@x9>C+IC*1uY_ksVC$FZ}{s^Vfn_4%zQJ7aJXhdfa{8s(Bb69=VbN^G>Vwm>Ri(hVOM5Y~9g37;S8wcJs@<9;Yw_S<6f} zYis|S){>kEseaYX&B+3Ps42rZ?9HN@#Fgxyhvzz|2i)J{?YQXbSBV2_bMb;J+8hJd zzwN9?^^bt~c`rV%yBMqV#=f}_3l6K-!Ust*l)3s(^oO|VgX-3p&{0v-s2vju^4J#ZKMq&`9xnyQf3zwr}5%<~dla_BNW1 zl$AkvY}IrGiqrd4CKpxhtTK7HGZ1ya?xbd+m<~w#==KRQiyj}G$ z%h&+tD?C-*J4of8LP`DarkheHr|P6iMuVH!joFbn$;&rsTOIDQC@m=O>irGwKZ#9L z+t>9s3HFDCAt&T4+bijkO5%O<#lTT^C&tgLPcpyW=6Vq9a;Xu~sFeOY9~eKwAn4>} zrFo?!wjm?@?Q(^*EFJh`i_quavHhdpv3*|%W}36r#yekbTq67jcZOS#(pi~+`w{1r z>VME=Fz3^YH+FC8lWOU>LZzR~{0hC|LhF4)DCA8j-G{^R5WtZ}SB+c3!RgUNM6@&$ z_#^#)We$Im9mNG8)96{r`6HB4(L4Z@(tx0}3x5)Fh9E#IV!%bj{mpI8WKj7FJh>5e z`TynMm4_(>_Fy1Ab#gS&?1(R-gasuA99P@|Ippqf4k?{MM>`Us_|S1_pvHKl>yT$8 z>W^OfWJm}|N&%^FGXxJLr9MhViYY4$kI0s<`>&T}sCa@8o0wFXLQPtsF`JHOvW|p6U4X!xez|89GdhISX6?<9P!|5n*Em zht8ej6n<)aQ6p?l>R9bmpdkY_%Yifqz44FS*Ik01sXV3~sd)1$+r3r>jOze`_u~rd z5>yx$bX89?O@ L@u=vbMZo_6*pP2# literal 0 HcmV?d00001 diff --git a/docs/index.rst b/docs/index.rst index 336cd2d47f..194e76df24 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -57,6 +57,7 @@ Transformer Engine documentation examples/te_gemma/tutorial_generation_gemma_with_te.ipynb examples/onnx/onnx_export.ipynb examples/te_jax_integration.ipynb + examples/op_fuser/op_fuser.rst .. 
toctree:: :hidden: diff --git a/tests/pytorch/test_fusible_ops.py b/tests/pytorch/test_fusible_ops.py index 5d1a5ce61d..f95f065d78 100644 --- a/tests/pytorch/test_fusible_ops.py +++ b/tests/pytorch/test_fusible_ops.py @@ -3428,8 +3428,15 @@ def test_custom_basic_op( ) -> None: """Custom basic op""" - class CustomScaleOp(te.ops.BasicOperation): - """Custom op that applies a learnable scale""" + class LearnableScale(te.ops.BasicOperation): + """Custom op that applies a learnable scale + + This class is used as an example in the op fuser guide at + docs/examples/op_fuser/op_fuser.rst (see "Implementing a + basic operation"). Any changes made to this class should + also be made there. + + """ def __init__(self) -> None: super().__init__() @@ -3442,23 +3449,19 @@ def op_forward( self, ctx: OperationContext, input_: torch.Tensor, - prev_op_grad_output_quantizer: Optional[Quantizer], - next_op_input_quantizer: Optional[Quantizer], + **unused, ) -> torch.Tensor: + out = self.scale * input_ ctx.save_for_backward(self.scale, input_) - return self.scale * input_ + return out def op_backward( self, ctx: OperationContext, grad_output: torch.Tensor, - ) -> torch.Tensor: - ( - scale, - input_, - ) = ctx.saved_tensors - grad_scale = torch.inner(input_.reshape(-1), grad_output.reshape(-1)) - grad_scale = grad_scale.reshape(()) + ) -> tuple[torch.Tensor, Iterable[Optional[torch.Tensor]]]: + scale, input_ = ctx.saved_tensors + grad_scale = torch.inner(input_.reshape(-1), grad_output.reshape(-1)).reshape(()) grad_input = scale * grad_output return grad_input, (grad_scale,) @@ -3485,7 +3488,7 @@ def op_backward( y_ref.backward(dy_ref) # Implementation with fusible operation - op = CustomScaleOp() + op = LearnableScale() forward = te.ops.Sequential(te.ops.Identity(), op, te.ops.Identity()) with torch.no_grad(): op.scale.copy_(w_test) @@ -3502,7 +3505,112 @@ def op_backward( torch.testing.assert_close(dx_test, x_ref.grad, **tols) torch.testing.assert_close(dw_test, w_ref.grad, **tols) - def
test_custom_forward_fused_op( + def test_custom_forward_fused_op1( + self, + *, + shape: Iterable[int] = (5, 11), + dtype: torch.dtype = torch.float32, + device: torch.device = "cuda", + ): + """Custom fused op in forward pass""" + + class ForwardAxpy(te.ops.FusedOperation): + """Custom op that computes BLAS SAXPY in forward pass + + This class is used as an example in the op fuser guide at + docs/examples/op_fuser/op_fuser.rst (see "Implementing a + fused operation"). Any changes made to this class should + also be made there. + + """ + + _enabled = True + + def __init__( + self, + scale: te.ops.ConstantScale, + add: te.ops.AddExtraInput, + ) -> None: + super().__init__((scale, add)) + + def fuser_forward( + self, + basic_op_ctxs: list[OperationContext], + input_: torch.Tensor, + basic_op_extra_inputs: list[tuple[torch.Tensor, ...]], + **unused, + ) -> tuple[torch.Tensor, list[tuple[torch.Tensor, ...]]]: + scale_op, add_op = self.basic_ops + extra_input = basic_op_extra_inputs[1][0] # Extra input to add op + out = scale_op.scale * input_ + extra_input + scale_ctx, add_ctx = basic_op_ctxs # No state needed for backward + return ( + out, # Output + [(), ()], # Extra outputs for each basic op + ) + + def fuse_axpy_ops( + ops: list[te.ops.FusibleOperation], + **unused, + ) -> list[te.ops.FusibleOperation]: + """Apply fusion the first time this function is called""" + if ForwardAxpy._enabled: + ForwardAxpy._enabled = False + else: + return ops + out = [] + window, ops = ops[:2], ops[2:] + while len(window) == 2: + if isinstance(window[0], te.ops.ConstantScale) and isinstance( + window[1], te.ops.AddExtraInput + ): + window = [ForwardAxpy(*window)] + else: + out.append(window[0]) + window = window[1:] + window, ops = window + ops[:1], ops[1:] + out.extend(window + ops) + return out + + # Random data + scale = 0.5 + x1_ref, x1_test = make_reference_and_test_tensors( +
shape, + test_dtype=dtype, + test_device=device, + ) + dy_ref, dy_test = make_reference_and_test_tensors( + shape, + test_dtype=dtype, + test_device=device, + requires_grad=False, + ) + + # Plain PyTorch implementation + y_ref = scale * x1_ref + x2_ref + y_ref.backward(dy_ref) + + # Implementation with fusible operation + te.ops.register_forward_fusion(fuse_axpy_ops) + model = te.ops.Sequential( + te.ops.ConstantScale(scale=scale), + te.ops.AddExtraInput(), + ) + y_test = model(x1_test, x2_test) + y_test.backward(dy_test) + + # Check values + tols = dtype_tols(dtype) + assert_close(y_test, y_ref, **tols) + assert_close_grads(x1_test, x1_ref, **tols) + assert_close_grads(x2_test, x2_ref, **tols) + + def test_custom_forward_fused_op2( self, *, shape: Iterable[int] = (7, 11), diff --git a/transformer_engine/pytorch/ops/basic/activation.py b/transformer_engine/pytorch/ops/basic/activation.py index 9e23bb3fb1..13cb519c19 100644 --- a/transformer_engine/pytorch/ops/basic/activation.py +++ b/transformer_engine/pytorch/ops/basic/activation.py @@ -152,7 +152,7 @@ class GELU(_ActivationOperation): \text{GELU}(x) \approx \frac{x}{2} \left( 1 + \tanh\left( 0.797x+0.036 x^3 \right) \right) - See `Gaussian Error Linear Units (GELUs)<https://arxiv.org/abs/1606.08415>`__. + See `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`__. """ @@ -183,8 +183,8 @@ class GLU(_ActivationOperation): the first half of the input tensor, while PyTorch applies it to the second half. - See `Language Modeling with Gated Convolutional Networks<https://arxiv.org/abs/1612.08083>`__ - and `GLU Variants Improve Transformer<https://arxiv.org/abs/2002.05202>`__. + See `Language Modeling with Gated Convolutional Networks <https://arxiv.org/abs/1612.08083>`__ + and `GLU Variants Improve Transformer <https://arxiv.org/abs/2002.05202>`__. """ @@ -219,7 +219,7 @@ class GEGLU(_ActivationOperation): the first half of the input tensor, while PyTorch applies it to the second half. - See `GLU Variants Improve Transformer<https://arxiv.org/abs/2002.05202>`__. + See `GLU Variants Improve Transformer <https://arxiv.org/abs/2002.05202>`__.
""" @@ -233,8 +233,8 @@ def _activation_backward_impl(self, *args, **kwargs) -> torch.Tensor: class QGELU(_ActivationOperation): r"""Quick Gaussian Error Linear Unit - Quick GELU from `HuggingFace`__ - and `paper`__. + Quick GELU from `HuggingFace `__ + and `paper `__. .. math:: @@ -316,7 +316,7 @@ class ReGLU(_ActivationOperation): the first half of the input tensor, while PyTorch applies it to the second half. - See `GLU Variants Improve Transformer`__. + See `GLU Variants Improve Transformer `__. """ @@ -334,7 +334,7 @@ class SReLU(_ActivationOperation): \text{SReLU}(x) = \max(x^2,0) - See `Primer: Searching for Efficient Transformers for Language Modeling`__. + See `Primer: Searching for Efficient Transformers for Language Modeling `__. """ diff --git a/transformer_engine/pytorch/ops/basic/add_extra_input.py b/transformer_engine/pytorch/ops/basic/add_extra_input.py index 47f2b6e248..fc3ca9cade 100644 --- a/transformer_engine/pytorch/ops/basic/add_extra_input.py +++ b/transformer_engine/pytorch/ops/basic/add_extra_input.py @@ -30,7 +30,7 @@ class AddExtraInput(BasicOperation): feature and most users are discouraged from it. In-place operations break some autograd assumptions and they can result in subtle, esoteric bugs. - Compare to `MakeExtraOutput`, which does a similar operation in + Compare to ``MakeExtraOutput``, which does a similar operation in the backward pass. """ diff --git a/transformer_engine/pytorch/ops/basic/basic_linear.py b/transformer_engine/pytorch/ops/basic/basic_linear.py index e640f3ffb1..48376a297f 100644 --- a/transformer_engine/pytorch/ops/basic/basic_linear.py +++ b/transformer_engine/pytorch/ops/basic/basic_linear.py @@ -48,8 +48,8 @@ def _wait_async(handle: Optional[Any]) -> None: class BasicLinear(BasicOperation): """Apply linear transformation: :math:`y = x A^T` - This is a drop-in replacement for `torch.nn.Linear` with - `bias=False`. + This is a drop-in replacement for ``torch.nn.Linear`` with + ``bias=False``. 
Parameters ---------- @@ -61,27 +61,27 @@ class BasicLinear(BasicOperation): Tensor device dtype : torch.dtype, default = default dtype Tensor datatype - tensor_parallel_mode : {`None`, "column", "row"}, default = `None` + tensor_parallel_mode : {None, "column", "row"}, default = None Mode for tensor parallelism tensor_parallel_group : torch.distributed.ProcessGroup, default = world group Process group for tensor parallelism - sequence_parallel : bool, default = `False` + sequence_parallel : bool, default = False Whether to apply sequence parallelism together with tensor parallelism, i.e. distributing input or output tensors along outer dimension (sequence or batch dim) when not distributing along inner dimension (embedding dim) rng_state_tracker_function : callable - Function that returns `CudaRNGStatesTracker`, which is used + Function that returns ``CudaRNGStatesTracker``, which is used for model-parallel weight initialization - accumulate_into_main_grad : bool, default = `False` + accumulate_into_main_grad : bool, default = False Whether to directly accumulate weight gradients into the - weight's `main_grad` attribute instead of relying on PyTorch - autograd. The weight's `main_grad` must be set externally and - there is no guarantee that `grad` will be set or be - meaningful. This is primarily intented to integrate with + weight's ``main_grad`` attribute instead of relying on PyTorch + autograd. The weight's ``main_grad`` must be set externally + and there is no guarantee that ``grad`` will be set or be + meaningful. This is primarily intended to integrate with Megatron-LM. This argument along with weight tensor having - attribute 'overwrite_main_grad' set to True will overwrite - `main_grad` instead of accumulating. + attribute ``overwrite_main_grad`` set to ``True`` will + overwrite ``main_grad`` instead of accumulating. userbuffers_options, dict, optional Options for overlapping tensor-parallel communication with compute using Userbuffers. 
This feature is highly @@ -184,7 +184,7 @@ def _canonicalize_tensor_parallelism( Parameters ---------- - mode: {`None`, "column", "row"} + mode: {None, "column", "row"} Mode for tensor parallelism process_group: torch.distributed.ProcessGroup Process group for tensor parallelism @@ -200,7 +200,7 @@ def _canonicalize_tensor_parallelism( Returns ------- - mode: {`None`, "column", "row"} + mode: {None, "column", "row"} Mode for tensor parallelism process_group: torch.distributed.ProcessGroup Process group for tensor parallelism @@ -446,18 +446,18 @@ def _functional_forward( Output tensor beta: float, optional Scaling factor applied to original value of out when accumulating into it - accumulate_into_out: bool, default = `False` + accumulate_into_out: bool, default = False Add result to output tensor instead of overwriting - tensor_parallel_mode: {`None`, "column", "row"}, default = `None` + tensor_parallel_mode: {None, "column", "row"}, default = None Mode for tensor parallelism tensor_parallel_group: torch.distributed.ProcessGroup, default = world group Process group for tensor parallelism - sequence_parallel: bool, default = `False` + sequence_parallel: bool, default = False Whether to apply sequence parallelism together with tensor parallelism, i.e. distributing input or output tensors along outer dimension (sequence or batch dim) when not distributing along inner dimension (embedding dim) - with_quantized_compute: bool, default = `False` + with_quantized_compute: bool, default = False Whether to perform compute with quantized data. input_quantizer: Quantizer, optional Builder class for quantized input tensor. @@ -465,10 +465,10 @@ def _functional_forward( Builder class for quantized weight tensor. output_quantizer: Quantizer, optional Builder class for quantized output tensor. - input_requires_grad: bool, default = `True` + input_requires_grad: bool, default = True Whether the loss gradient w.r.t. the input tensor is required in the backward pass. 
- weight_requires_grad: bool, default = `True` + weight_requires_grad: bool, default = True Whether the loss gradient w.r.t. the weight tensor is required in the backward pass. @@ -477,11 +477,11 @@ def _functional_forward( torch.Tensor Output tensor torch.Tensor, optional - Input tensor, ready for use in backward pass. `None` is + Input tensor, ready for use in backward pass. ``None`` is returned if loss gradient w.r.t. the weight tensor is not required. torch.Tensor, optional - Weight tensor, ready for use in backward pass. `None` is + Weight tensor, ready for use in backward pass. ``None`` is returned if loss gradient w.r.t. the input tensor is not required. @@ -682,24 +682,24 @@ def _functional_backward( Loss gradient w.r.t. weight tensor grad_weight_beta: float, optional Scaling factor applied to original value of grad_weight when accumulating into it - accumulate_into_grad_weight: bool, default = `False` + accumulate_into_grad_weight: bool, default = False Add result to weight grad instead of overwriting grad_input: torch.Tensor, optional Loss gradient w.r.t. input tensor grad_input_beta: float, optional Scaling factor applied to original value of grad_input when accumulating into it - accumulate_into_grad_input: bool, default = `False` + accumulate_into_grad_input: bool, default = False Add result to input grad instead of overwriting - tensor_parallel_mode: {`None`, "column", "row"}, default = `None` + tensor_parallel_mode: {None, "column", "row"}, default = None Mode for tensor parallelism tensor_parallel_group: torch.distributed.ProcessGroup, default = world group Process group for tensor parallelism - sequence_parallel: bool, default = `False` + sequence_parallel: bool, default = False Whether to apply sequence parallelism together with tensor parallelism, i.e. 
distributing input or output tensors along outer dimension (sequence or batch dim) when not distributing along inner dimension (embedding dim) - with_quantized_compute: bool, default = `False` + with_quantized_compute: bool, default = False Whether to perform compute with quantized data. input_quantizer: Quantizer, optional Builder class for quantized input tensor. diff --git a/transformer_engine/pytorch/ops/basic/bias.py b/transformer_engine/pytorch/ops/basic/bias.py index 8b60251088..d580f84866 100644 --- a/transformer_engine/pytorch/ops/basic/bias.py +++ b/transformer_engine/pytorch/ops/basic/bias.py @@ -18,7 +18,7 @@ class Bias(BasicOperation): """Apply additive bias - This is equivalent to the additive bias in `torch.nn.Linear`. + This is equivalent to the additive bias in ``torch.nn.Linear``. Parameters ---------- @@ -28,7 +28,7 @@ class Bias(BasicOperation): Tensor device dtype : torch.dtype, default = default dtype Tensor datatype - tensor_parallel : bool, default = `False` + tensor_parallel : bool, default = False Whether to distribute input tensor and bias tensors along inner dimension tensor_parallel_group : torch.distributed.ProcessGroup, default = world group diff --git a/transformer_engine/pytorch/ops/basic/grouped_linear.py b/transformer_engine/pytorch/ops/basic/grouped_linear.py index eb8a67600d..b44e77b0c6 100644 --- a/transformer_engine/pytorch/ops/basic/grouped_linear.py +++ b/transformer_engine/pytorch/ops/basic/grouped_linear.py @@ -65,7 +65,7 @@ class GroupedLinear(BasicOperation): weight's ``main_grad`` attribute instead of relying on PyTorch autograd. The weight's ``main_grad`` must be set externally and there is no guarantee that `grad` will be set or be - meaningful. This is primarily intented to integrate with + meaningful. This is primarily intended to integrate with Megatron-LM. This argument along with weight tensor having attribute ``overwrite_main_grad`` set to True will overwrite ``main_grad`` instead of accumulating. 
diff --git a/transformer_engine/pytorch/ops/basic/layer_norm.py b/transformer_engine/pytorch/ops/basic/layer_norm.py index 631f0fafc9..3fda5145c6 100644 --- a/transformer_engine/pytorch/ops/basic/layer_norm.py +++ b/transformer_engine/pytorch/ops/basic/layer_norm.py @@ -31,7 +31,7 @@ class LayerNorm(BasicOperation): r"""Layer Normalization Applies Layer Normalization over a mini-batch of inputs as described in - the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__ + the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__ . .. math:: y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \varepsilon}} * \gamma + \beta @@ -51,9 +51,9 @@ class LayerNorm(BasicOperation): Tensor device dtype : torch.dtype, default = default dtype Tensor datatype - zero_centered_gamma : bool, default = 'False' - If `True`, the :math:`\gamma` parameter is initialized to zero - and the calculation changes to + zero_centered_gamma : bool, default = False + If ``True``, the :math:`\gamma` parameter is initialized to + zero and the calculation changes to .. math:: y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \varepsilon}} * (1 + \gamma) + \beta diff --git a/transformer_engine/pytorch/ops/basic/make_extra_output.py b/transformer_engine/pytorch/ops/basic/make_extra_output.py index 61caaaf65d..0d9c870262 100644 --- a/transformer_engine/pytorch/ops/basic/make_extra_output.py +++ b/transformer_engine/pytorch/ops/basic/make_extra_output.py @@ -35,7 +35,7 @@ class MakeExtraOutput(BasicOperation): operations break some autograd assumptions and they can result in subtle, esoteric bugs. - Compare to `AddExtraInput`, which does a similar operation in the + Compare to ``AddExtraInput``, which does a similar operation in the backward pass.
""" diff --git a/transformer_engine/pytorch/ops/basic/quantize.py b/transformer_engine/pytorch/ops/basic/quantize.py index d126b554b5..fa3efc3807 100644 --- a/transformer_engine/pytorch/ops/basic/quantize.py +++ b/transformer_engine/pytorch/ops/basic/quantize.py @@ -18,14 +18,14 @@ class Quantize(BasicOperation): """Quantize tensor data - Uses recipe from `autocast` context. When called outside - of an `autocast` context, this is an identity operation. + Uses recipe from ``autocast`` context. When called outside + of an ``autocast`` context, this is an identity operation. Parameters ---------- - forward : bool, default = `True` + forward : bool, default = True Perform quantization in forward pass - backward : bool, default = `False` + backward : bool, default = False Perform quantization in backward pass """ diff --git a/transformer_engine/pytorch/ops/basic/reshape.py b/transformer_engine/pytorch/ops/basic/reshape.py index f8ae86fecd..4a171c294b 100644 --- a/transformer_engine/pytorch/ops/basic/reshape.py +++ b/transformer_engine/pytorch/ops/basic/reshape.py @@ -20,7 +20,7 @@ class Reshape(BasicOperation): """Reshape tensor - See `torch.reshape`. + See ``torch.reshape``. Parameters ---------- diff --git a/transformer_engine/pytorch/ops/basic/rmsnorm.py b/transformer_engine/pytorch/ops/basic/rmsnorm.py index 3179d0a447..1d8d8be971 100644 --- a/transformer_engine/pytorch/ops/basic/rmsnorm.py +++ b/transformer_engine/pytorch/ops/basic/rmsnorm.py @@ -32,7 +32,7 @@ class RMSNorm(BasicOperation): Applies Root Mean Square Layer Normalization over a mini-batch of inputs as described in the paper - `Root Mean Square Layer Normalization `__ + `Root Mean Square Layer Normalization `__ . .. 
math:: y = \frac{x}{\sqrt{\mathrm{Var}[x] + \varepsilon}} * \gamma @@ -50,8 +50,8 @@ class RMSNorm(BasicOperation): Tensor device dtype : torch.dtype, default = default dtype Tensor datatype - zero_centered_gamma : bool, default = 'False' - If `True`, the :math:`\gamma` parameter is initialized to zero + zero_centered_gamma : bool, default = False + If ``True``, the :math:`\gamma` parameter is initialized to zero and the calculation changes to .. math:: diff --git a/transformer_engine/pytorch/ops/basic/swiglu.py b/transformer_engine/pytorch/ops/basic/swiglu.py index eaffbeee02..b4427df41a 100644 --- a/transformer_engine/pytorch/ops/basic/swiglu.py +++ b/transformer_engine/pytorch/ops/basic/swiglu.py @@ -46,7 +46,7 @@ class SwiGLU(BasicOperation): The Sigmoid Linear Unit (SiLU) gating function is also known as the swish function. See - ``GLU Variants Improve Transformer``__. + `GLU Variants Improve Transformer `__. Parameters ---------- @@ -189,14 +189,18 @@ def op_backward( class ClampedSwiGLU(BasicOperation): r"""GPT-OSS - Implementation based on ``GPT-OSS``__. + Implementation based on `GPT-OSS `__. This activation has two differences compared to the original SwiGLU 1. Both gate and pre-activations are clipped based on parameter limit. 2. Activation uses sigmoid(alpha * x) instead of sigmoid(x) used in Swish activation. - .. warning:: The input tensor is chunked along the last dimension to get gates/pre-activations which is different - from GPT OSS implementation where the gates/pre-activations are assumed to be interleaved in the input tensor. + .. warning:: + + The input tensor is chunked along the last dimension to get + gates/pre-activations which is different from GPT OSS + implementation where the gates/pre-activations are assumed to + be interleaved in the input tensor. 
Parameters ---------- diff --git a/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py b/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py index 90ade030c8..fbaf69d75d 100644 --- a/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py +++ b/transformer_engine/pytorch/ops/fused/userbuffers_backward_linear.py @@ -125,18 +125,18 @@ def _functional_backward( Tensor datatype grad_weight: torch.Tensor, optional Loss gradient w.r.t. weight tensor - accumulate_into_grad_weight: bool, default = `False` + accumulate_into_grad_weight: bool, default = False Add result to weight grad instead of overwriting - tensor_parallel_mode: {`None`, "column", "row"}, default = `None` + tensor_parallel_mode: {None, "column", "row"}, default = None Mode for tensor parallelism tensor_parallel_group: torch.distributed.ProcessGroup, default = world group Process group for tensor parallelism - sequence_parallel: bool, default = `False` + sequence_parallel: bool, default = False Whether to apply sequence parallelism together with tensor parallelism, i.e. distributing input or output tensors along outer dimension (sequence or batch dim) when not distributing along inner dimension (embedding dim) - with_quantized_compute: bool, default = `False` + with_quantized_compute: bool, default = False Whether to perform compute with quantized data. input_quantizer: Quantizer, optional Builder class for quantized input tensor. 
diff --git a/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py b/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py index 6ef9bf083b..0d3e1d0416 100644 --- a/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py +++ b/transformer_engine/pytorch/ops/fused/userbuffers_forward_linear.py @@ -115,16 +115,16 @@ def _functional_forward( Tensor device dtype: torch.dtype Tensor datatype - tensor_parallel_mode: {`None`, "column", "row"}, default = `None` + tensor_parallel_mode: {None, "column", "row"}, default = None Mode for tensor parallelism tensor_parallel_group: torch.distributed.ProcessGroup, default = world group Process group for tensor parallelism - sequence_parallel: bool, default = `False` + sequence_parallel: bool, default = False Whether to apply sequence parallelism together with tensor parallelism, i.e. distributing input or output tensors along outer dimension (sequence or batch dim) when not distributing along inner dimension (embedding dim) - with_quantized_compute: bool, default = `False` + with_quantized_compute: bool, default = False Whether to perform compute with quantized data. input_quantizer: Quantizer, optional Builder class for quantized input tensor. @@ -132,10 +132,10 @@ def _functional_forward( Builder class for quantized weight tensor. output_quantizer: Quantizer, optional Builder class for quantized output tensor. - input_requires_grad: bool, default = `True` + input_requires_grad: bool, default = True Whether the loss gradient w.r.t. the input tensor is required in the backward pass. - weight_requires_grad: bool, default = `True` + weight_requires_grad: bool, default = True Whether the loss gradient w.r.t. the weight tensor is required in the backward pass. 
ub_comm_name: str diff --git a/transformer_engine/pytorch/ops/fuser.py b/transformer_engine/pytorch/ops/fuser.py index 7fe6ea37ed..bd3bc94b60 100644 --- a/transformer_engine/pytorch/ops/fuser.py +++ b/transformer_engine/pytorch/ops/fuser.py @@ -31,7 +31,7 @@ def _split_tuple(t: tuple, idx: int) -> tuple[tuple, tuple]: def _is_graph_capturing() -> bool: - """Whether function is called within `make_graphed_callables` + """Whether function is called within ``make_graphed_callables`` Avoid circular import with lazy import. @@ -519,6 +519,8 @@ def register_forward_fusion( The fusion function should have the following signature: + .. code-block:: python + func(ops, *, recipe) -> updated ops Parameters @@ -545,6 +547,8 @@ def register_backward_fusion( The fusion function should have the following signature: + .. code-block:: python + func(ops, *, recipe) -> updated ops Parameters diff --git a/transformer_engine/pytorch/ops/linear.py b/transformer_engine/pytorch/ops/linear.py index d5829b0c50..c6ca4786b8 100644 --- a/transformer_engine/pytorch/ops/linear.py +++ b/transformer_engine/pytorch/ops/linear.py @@ -23,7 +23,7 @@ class Linear(FusedOperation): """Apply linear transformation: :math:`y = x A^T + b` - This is a drop-in replacement for `torch.nn.Linear`. + This is a drop-in replacement for ``torch.nn.Linear``. 
Parameters ---------- @@ -31,17 +31,17 @@ class Linear(FusedOperation): Inner dimension of input tensor out_features : int Inner dimension of output tensor - bias : bool, default = `True` + bias : bool, default = True Apply additive bias device : torch.device, default = default CUDA device Tensor device dtype : torch.dtype, default = default dtype Tensor datatype - tensor_parallel_mode : {`None`, "column", "row"}, default = `None` + tensor_parallel_mode : {None, "column", "row"}, default = None Mode for tensor parallelism tensor_parallel_group : torch.distributed.ProcessGroup, default = world group Process group for tensor parallelism - sequence_parallel : bool, default = `False` + sequence_parallel : bool, default = False Whether to apply sequence parallelism together with tensor parallelism, i.e. distributing input or output tensors along outer dimension (sequence or batch dim) when not distributing @@ -49,12 +49,12 @@ class Linear(FusedOperation): rng_state_tracker_function : callable Function that returns CudaRNGStatesTracker, which is used for model-parallel weight initialization - accumulate_into_main_grad : bool, default = `False` + accumulate_into_main_grad : bool, default = False Whether to directly accumulate weight gradients into the - weight's `main_grad` attribute instead of relying on PyTorch - autograd. The weight's `main_grad` must be set externally and - there is no guarantee that `grad` will be set or be - meaningful. This is primarily intented to integrate with + weight's ``main_grad`` attribute instead of relying on PyTorch + autograd. The weight's ``main_grad`` must be set externally and + there is no guarantee that ``grad`` will be set or be + meaningful. This is primarily intended to integrate with Megatron-LM. 
""" diff --git a/transformer_engine/pytorch/ops/op.py b/transformer_engine/pytorch/ops/op.py index 47286dfced..54b3f00117 100644 --- a/transformer_engine/pytorch/ops/op.py +++ b/transformer_engine/pytorch/ops/op.py @@ -94,7 +94,7 @@ def fuser_forward( several of this function's arguments are lists of arguments to forward functions of corresponding basic ops. - Called by `OperationFuser`. + Called by ``OperationFuser``. Parameters ---------- @@ -141,7 +141,7 @@ def fuser_backward( several of this function's arguments are lists of arguments to backward functions of corresponding basic ops. - Called by `OperationFuser`. + Called by ``OperationFuser``. Parameters ---------- diff --git a/transformer_engine/pytorch/ops/sequential.py b/transformer_engine/pytorch/ops/sequential.py index a0db3cd2d0..592ddae23a 100644 --- a/transformer_engine/pytorch/ops/sequential.py +++ b/transformer_engine/pytorch/ops/sequential.py @@ -15,10 +15,10 @@ class Sequential(torch.nn.Module): - """Sequential container for fusible operations + """Sequential container for fusible operations. - This is a drop-in replacement for `torch.nn.Sequential`, with - support for fusing `FusibleOperation`s. + This is a drop-in replacement for ``torch.nn.Sequential`` with + support for fusing ``FusibleOperation`` s. Parameters ----------