%%% ====================================================
%%% BibTeX-file{
%%%    author = "Quinlan, James",
%%%    version = "1.01",
%%%    date = "1 Apr 2024",
%%%    filename = "mixed.bib",
%%%    keywords = "Iterative Refinement, mixed-precision,
%%%                ill-conditioned systems",
%%%    AMS = "65G50, 65F10"
%%% }
%%% See https://www.bibtex.com/format/
%%% ====================================================

% A
% ------------------------
@article{abdelfattah2021survey, title={A survey of numerical linear algebra methods utilizing mixed-precision arithmetic}, author={Abdelfattah, Ahmad and Anzt, Hartwig and Boman, Erik G and Carson, Erin and Cojean, Terry and Dongarra, Jack and Fox, Alyson and Gates, Mark and Higham, Nicholas J and Li, Xiaoye S and others}, journal={The International Journal of High Performance Computing Applications}, volume={35}, number={4}, pages={344--369}, year={2021}, publisher={SAGE Publications Sage UK: London, England}, annote = {The efficient utilization of mixed-precision numerical linear algebra algorithms can offer attractive acceleration to scientific computing applications. Especially with the hardware integration of low-precision special-function units designed for machine learning applications, the traditional numerical algorithms community must reconsider the floating point formats used in the distinct operations to leverage the available computing power efficiently. This work comprehensively surveys mixed-precision numerical linear algebra routines, including the underlying concepts, theoretical background, and experimental results for dense and sparse linear algebra problems.}, doi = {10.1177/10943420211003313} }
@article{al2006lu, title={{LU}-decomposition with iterative refinement for solving sparse linear systems}, author={Al-Kurdi, Ahmad and Kincaid, David R}, journal={Journal of Computational and Applied Mathematics}, volume={185}, number={2}, pages={391--403}, year={2006}, publisher={Elsevier} }
@article{amestoy2023combining, title={Combining sparse approximate factorizations with mixed-precision iterative refinement}, author={Amestoy, Patrick and Buttari, Alfredo and Higham, Nicholas J and L'Excellent, Jean-Yves and Mary, Th{\'e}o and Vieubl{\'e}, Bastien}, journal={ACM Transactions on Mathematical Software}, volume={49}, number={1}, pages={1--29}, year={2023}, publisher={ACM New York, NY} }
@article{amestoy2021five, title={Five-Precision {GMRES}-based iterative refinement}, author={Amestoy, Patrick and Buttari, Alfredo and Higham, Nicholas J and L'Excellent, Jean-Yves and Mary, Th{\'e}o and Vieubl{\'e}, Bastien}, year={2021} }
@book{anderson1999lapack, title={LAPACK users' guide}, author={Anderson, Edward and Bai, Zhaojun and Bischof, Christian and Blackford, L Susan and Demmel, James and Dongarra, Jack and Du Croz, Jeremy and Greenbaum, Anne and Hammarling, Sven and McKenney, Alan and others}, year={1999}, publisher={SIAM} }
@inproceedings{arafa2020verified, title={Verified instruction-level energy consumption measurement for {NVIDIA GPU}s}, author={Arafa, Yehia and ElWazir, Ammar and ElKanishy, Abdelrahman and Aly, Youssef and Elsayed, Ayatelrahman and Badawy, Abdel-Hameed and Chennupati, Gopinath and Eidenbenz, Stephan and Santhi, Nandakishore}, booktitle={Proceedings of the 17th ACM International Conference on Computing Frontiers}, pages={60--70}, year={2020} }

% B
% ------------------------
@article{baboulin2009accelerating, title={Accelerating scientific computations with mixed precision algorithms}, author={Baboulin, Marc and Buttari, Alfredo and Dongarra, Jack
and Kurzak, Jakub and Langou, Julie and Langou, Julien and Luszczek, Piotr and Tomov, Stanimire}, journal={Computer Physics Communications}, volume={180}, number={12}, pages={2526--2533}, year={2009}, publisher={Elsevier} } @article{baglama1998adaptively, title={Adaptively preconditioned GMRES algorithms}, author={Baglama, James and Calvetti, Daniela and Golub, Gene H and Reichel, Lothar}, journal={SIAM Journal on Scientific Computing}, volume={20}, number={1}, pages={243--269}, year={1998}, publisher={SIAM} } @article{bailey2015high, title={High-precision arithmetic in mathematical physics}, author={Bailey, David H and Borwein, Jonathan M}, journal={Mathematics}, volume={3}, number={2}, pages={337--367}, year={2015}, publisher={MDPI} } @book{barrett1994templates, title={Templates for the solution of linear systems: Building blocks for iterative methods}, author={Barrett, Richard and Berry, Michael and Chan, Tony F and Demmel, James and Donato, June and Dongarra, Jack and Eijkhout, Victor and Pozo, Roldan and Romine, Charles and Van der Vorst, Henk}, year={1994}, publisher={SIAM}, keywords={iterative solvers}, annote={Covers Jacobi, Gauss-Seidel, SOR, conjugate gradient, MINRES, GMRES, BiCG, QMR, CGS, Bi-CGSTAB, and Chebyshev Iteration. Preconditioners are also mentioned.} } @article{bauer1963optimally, title={Optimally scaled matrices}, author={Bauer, Friedrich L}, journal={Numerische Mathematik}, volume={5}, number={1}, pages={73--87}, year={1963}, publisher={Springer} } @article{bazan2018schultz, title={Schultz matrix iteration based method for stable solution of discrete ill-posed problems}, author={Baz{\'a}n, Ferm{\'\i}n SV and Boos, Everton}, journal={Linear Algebra and its Applications}, volume={554}, pages={120--145}, year={2018}, publisher={Elsevier}, keywords = {Schultz-Jones-Meyers. 
Approximate inverse.}, annote={} }
@book{ben2003generalized, title={Generalized inverses: theory and applications}, author={Ben-Israel, Adi and Greville, Thomas NE}, volume={15}, year={2003}, publisher={Springer Science \& Business Media} }
% Schultz-Jones-Meyers
@article{ben1966iterative, title={On iterative computation of generalized inverses and associated projections}, author={Ben-Israel, Adi and Cohen, Dan}, journal={SIAM Journal on Numerical Analysis}, volume={3}, number={3}, pages={410--419}, year={1966}, publisher={SIAM} }
@inproceedings{betkaoui2010comparing, title={Comparing performance and energy efficiency of {FPGA}s and {GPU}s for high productivity computing}, author={Betkaoui, Brahim and Thomas, David B and Luk, Wayne}, booktitle={2010 International Conference on Field-Programmable Technology}, pages={94--101}, year={2010}, organization={IEEE}, keywords={}, annote={} }
@article{blanchard2020class, title={A class of fast and accurate summation algorithms}, author={Blanchard, Pierre and Higham, Nicholas J and Mary, Theo}, journal={SIAM Journal on Scientific Computing}, volume={42}, number={3}, pages={A1541--A1557}, year={2020}, publisher={SIAM}, keywords={}, annote={} }
% Brown
@phdthesis{brown2010profile, title={Profile-directed specialization of custom floating-point hardware}, author={Brown, Ashley W}, year={2010}, school={Imperial College London}, keywords={}, annote={} }
@inproceedings{buoncristiani2020evaluating, title={Evaluating the numerical stability of posit arithmetic}, author={Buoncristiani, Nicholas and Shah, Sanjana and Donofrio, David and Shalf, John}, booktitle={2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, pages={612--621}, year={2020}, organization={IEEE} }
% Burden Faires 4th
@book{burden1988numerical, title={Numerical analysis}, author={Burden, Richard L and Faires, J Douglas}, edition={4th}, year={1988}, publisher={PWS Publishing Co.}, keywords={}, annote={} }
% Businger - Optimally scaled matrices
@article{businger1968matrices, title={Matrices which can be optimally scaled}, author={Businger, PA}, journal={Numerische Mathematik}, volume={12}, pages={346--348}, year={1968}, publisher={Springer}, keywords={}, annote={}, abstract={Sufficient conditions are given for a matrix to be optimally scaled to minimize its condition number.} }
@article{buttari2008using, title={Using mixed precision for sparse matrix computations to enhance the performance while achieving 64-bit accuracy}, author={Buttari, Alfredo and Dongarra, Jack and Kurzak, Jakub and Luszczek, Piotr and Tomov, Stanimire}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={34}, number={4}, pages={1--22}, year={2008}, publisher={ACM New York, NY, USA}, keywords={}, annote={} }
@article{buttari2007mixed, title={Mixed precision iterative refinement techniques for the solution of dense linear systems}, author={Buttari, Alfredo and Dongarra, Jack and Langou, Julie and Langou, Julien and Luszczek, Piotr and Kurzak, Jakub}, journal={The International Journal of High Performance Computing Applications}, volume={21}, number={4}, pages={457--466}, year={2007}, publisher={Sage Publications Sage UK: London, England}, keywords={}, annote={} }
@inproceedings{bylina2013mixed, title={Mixed precision iterative refinement techniques for the {WZ} factorization}, author={Bylina, Beata and Bylina, Jaros{\l}aw}, booktitle={2013 Federated Conference on Computer Science and Information Systems}, pages={425--431}, year={2013}, organization={IEEE}, keywords={}, annote={} }

% C
% ------------------------
@inproceedings{carmichael2019deep, title={Deep positron: A deep neural network using the posit number system}, author={Carmichael, Zachariah and Langroudi, Hamed F and Khazanov, Char and Lillie, Jeffrey and Gustafson, John L and Kudithipudi, Dhireesha}, booktitle={2019 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)}, pages={1421--1426}, year={2019}, organization={IEEE}, keywords={}, annote={} }
@article{carson2022mixed, title={Mixed precision s-step Lanczos and conjugate gradient algorithms}, author={Carson, Erin and Gergelits, Tom{\'a}{\v{s}} and Yamazaki, Ichitaro}, journal={Numerical Linear Algebra with Applications}, volume={29}, number={3}, pages={e2425}, year={2022}, publisher={Wiley Online Library}, keywords={}, annote={} }
% https://github.com/eccarson
@article{carson2018accelerating, title={Accelerating the solution of linear systems by iterative refinement in three precisions}, author={Carson, Erin and Higham, Nicholas J}, journal={SIAM Journal on Scientific Computing}, volume={40}, number={2}, pages={A817--A847}, year={2018}, publisher={SIAM}, keywords={}, annote={} }
@article{carson2017new, title={A new analysis of iterative refinement and its application to accurate solution of ill-conditioned sparse linear systems}, author={Carson, Erin and Higham, Nicholas J}, journal={SIAM Journal on Scientific Computing}, volume={39}, number={6}, pages={A2834--A2856}, year={2017}, publisher={SIAM}, keywords={Iterative refinement}, annote={Iterative refinement} }
@article{chang2023survey, title={A survey on evaluation of large language models}, author={Chang, Yupeng and Wang, Xu and Wang, Jindong and Wu, Yuan and Zhu, Kaijie and Chen, Hao and Yang, Linyi and Yi, Xiaoyuan and Wang, Cunxiang and Wang, Yidong and others}, journal={arXiv preprint arXiv:2307.03109}, year={2023} }
@inproceedings{chien2019posit, title={Posit {NPB}: Assessing the precision improvement in {HPC} scientific applications}, author={Chien, Steven WD and Peng, Ivy B and Markidis, Stefano}, booktitle={International Conference on Parallel Processing and Applied Mathematics}, pages={301--310}, year={2019}, organization={Springer}, annote={With software-guided mixed precision, the tapered precision of posits can achieve higher precision than the IEEE 754 format using the same number of bits.} }
@article{choi2020learning, title={Learning sparse low-precision neural networks with learnable regularization}, author={Choi, Yoojin and El-Khamy, Mostafa and Lee, Jungwon}, journal={IEEE Access}, volume={8}, pages={96963--96974}, year={2020}, publisher={IEEE}, keywords={}, annote = {We consider learning deep neural networks (DNNs) that consist of low-precision weights and activations for efficient inference of fixed-point operations. In training low-precision networks, gradient descent in the backward pass is performed with high-precision weights. In contrast, quantized low-precision weights and activations are used in the forward pass to calculate the loss function for training. Thus, the gradient descent becomes suboptimal, and accuracy loss follows. To reduce the mismatch in the forward and backward passes, we utilize mean squared quantization error (MSQE) regularization. In particular, we propose using a learnable regularization coefficient with the MSQE regularizer to reinforce the convergence of high-precision weights to their quantized values.
We also investigate how partial L2 regularization can be similarly employed for weight pruning. Finally, combining weight pruning, quantization, and entropy coding, we establish a low-precision DNN compression pipeline. In our experiments, the proposed method yields low-precision MobileNet and ShuffleNet models on ImageNet classification with state-of-the-art compression ratios of 7.13 and 6.79, respectively. Moreover, we examine our method for image super-resolution networks to produce 8-bit low-precision models at negligible performance loss.} }
@article{choquette2021nvidia, title={{NVIDIA} {A}100 tensor core {GPU}: Performance and innovation}, author={Choquette, Jack and Gandhi, Wishwesh and Giroux, Olivier and Stam, Nick and Krashinsky, Ronny}, journal={IEEE Micro}, volume={41}, number={2}, pages={29--35}, year={2021}, doi={10.1109/mm.2021.3061394}, publisher={IEEE}, keywords={NVIDIA, A100, GPU}, annote={} }
@article{cline1979estimate, title={An estimate for the condition number of a matrix}, author={Cline, Alan K and Moler, Cleve B and Stewart, George W and Wilkinson, James H}, journal={SIAM Journal on Numerical Analysis}, volume={16}, number={2}, pages={368--375}, year={1979}, publisher={SIAM}, keywords={Condition Number Estimate}, annote={Condition Number Estimate} }
@article{cococcioni2020fast, title={Fast deep neural networks for image processing using posits and {ARM} scalable vector extension}, author={Cococcioni, Marco and Rossi, Federico and Ruffaldi, Emanuele and Saponara, Sergio}, journal={Journal of Real-Time Image Processing}, volume={17}, number={3}, pages={759--771}, year={2020}, publisher={Springer}, keywords={DNN, Image processing, Posits, ARM}, annote={} }
@article{cordero2023improving, title={Improving Newton--Schulz Method for Approximating Matrix Generalized Inverse by Using Schemes with Memory}, author={Cordero, Alicia and Maim{\'o}, Javier G and Torregrosa, Juan R and Vassileva, Mar{\'\i}a P}, journal={Mathematics}, volume={11}, number={14}, pages={3161}, year={2023}, publisher={MDPI} }
@article{courbariaux2014training, title={Training deep neural networks with low precision multiplications}, author={Courbariaux, Matthieu and Bengio, Yoshua and David, Jean-Pierre}, journal={arXiv preprint arXiv:1412.7024}, year={2014}, keywords={Training DNN, low-precision}, annote={} }
@book{cox1990reliable, title={Reliable numerical computation}, editor={Cox, Maurice G and Hammarling, Sven}, year={1990}, publisher={Clarendon Press}, address={Oxford}, keywords={Reliable}, annote={} }

% D
% ------------------------
@misc{dally2021log, title={Neural Network Accelerator Using Logarithmic-based Arithmetic}, author={Dally, William James and others}, note={US Patent Application 20210056397}, url={https://uspto.report/patent/app/20210056397}, year={2021}, keywords={LNS}, annote={} }
@article{davis2011university, title={The {U}niversity of {F}lorida {S}parse {M}atrix {C}ollection}, author={Davis, Timothy A and Hu, Yifan}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={38}, number={1}, pages={1--25}, year={2011}, publisher={ACM New York, NY, USA}, keywords={sparse matrix collection, test-suite}, annote={} }
@article{dawson2018reliable, title={Reliable low precision simulations in land surface models}, author={Dawson, Andrew and D{\"u}ben, Peter D and MacLeod, David A and Palmer, Tim N}, journal={Climate Dynamics}, volume={51}, number={7}, pages={2657--2666}, year={2018}, publisher={Springer}, keywords={}, annote={Climate models using low-precision} }
@article{de2011designing, title={Designing custom arithmetic data
paths with FloPoCo}, author={De Dinechin, Florent and Pasca, Bogdan}, journal={IEEE Design \& Test of Computers}, volume={28}, number={4}, pages={18--27}, year={2011}, publisher={IEEE}, keywords={}, annote={} }
@article{demmel2004accurate, title={Accurate and efficient floating point summation}, author={Demmel, James and Hida, Yozo}, journal={SIAM Journal on Scientific Computing}, volume={25}, number={4}, pages={1214--1248}, year={2004}, publisher={SIAM} }
@article{demmel2006error, title={Error bounds from extra-precise iterative refinement}, author={Demmel, James and Hida, Yozo and Kahan, William and Li, Xiaoye S and Mukherjee, Sonil and Riedy, E Jason}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={32}, number={2}, pages={325--351}, year={2006}, publisher={ACM New York, NY, USA}, keywords={}, annote={} }
@article{dongarra1990set, title={A set of level 3 basic linear algebra subprograms}, author={Dongarra, Jack J and Du Croz, Jeremy and Hammarling, Sven and Duff, Iain S}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={16}, number={1}, pages={1--17}, year={1990}, publisher={ACM New York, NY, USA} }
@article{duff2007towards, title={Towards stable mixed pivoting strategies for the sequential and parallel solution of sparse symmetric indefinite systems}, author={Duff, Iain S and Pralet, St{\'e}phane}, journal={SIAM Journal on Matrix Analysis and Applications}, volume={29}, number={3}, pages={1007--1024}, year={2007}, publisher={SIAM}, keywords={}, annote={} }
@article{duff1989sparse, title={Sparse matrix test problems}, author={Duff, Iain S and Grimes, Roger G and Lewis, John G}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={15}, number={1}, pages={1--14}, year={1989}, publisher={ACM New York, NY, USA} }

% E
% ------------------------
@article{elble2012scaling, title={Scaling linear optimization problems prior to application of the simplex method}, author={Elble, Joseph M and Sahinidis, Nikolaos V}, journal={Computational Optimization and Applications}, volume={52}, pages={345--371}, year={2012}, publisher={Springer} }
@article{eliezer2022energy, title={Energy awareness in low precision neural networks}, author={Eliezer, Nurit Spingarn and Banner, Ron and Hoffer, Elad and Ben-Yaakov, Hilla and Michaeli, Tomer}, journal={arXiv preprint arXiv:2202.02783}, year={2022} }

% F
% ------------------------
% Fang
@article{fang2016open, title={Open-source variable-precision floating-point library for major commercial {FPGA}s}, author={Fang, Xin and Leeser, Miriam}, journal={ACM Transactions on Reconfigurable Technology and Systems (TRETS)}, volume={9}, number={3}, pages={1--17}, year={2016}, publisher={ACM New York, NY, USA}, keywords={}, annote={} }
@article{fasi2021numerical, title={Numerical behavior of {NVIDIA} tensor cores}, author={Fasi, Massimiliano and Higham, Nicholas J and Mikaitis, Mantas and Pranesh, Srikara}, journal={PeerJ Computer Science}, volume={7}, pages={e330}, year={2021}, publisher={PeerJ Inc.} }
@misc{feldman2018fujitsu, title={Fujitsu reveals details of processor that will power {Post-K} supercomputer}, author={Feldman, Michael}, year={2018}, note={Retrieved March 26, 2019}, keywords={}, annote={} }
@article{fenner1977optimally, title={Optimally scalable matrices}, author={Fenner, TI and Loizou, G}, journal={Philosophical Transactions of the Royal Society of London.
Series A, Mathematical and Physical Sciences}, volume={287}, number={1345}, pages={307--349}, year={1977}, publisher={The Royal Society London} }
@book{faires2012numerical, title={Numerical methods}, edition={4th}, author={Faires, J Douglas and Burden, Richard L}, year={2012}, publisher={Cengage Learning}, keywords={}, annote={} }
@book{forsythe1967computer, title={Computer solution of linear algebraic systems}, author={Forsythe, George E and Moler, Cleve B}, year={1967}, publisher={Prentice-Hall, Englewood Cliffs, NJ, USA}, keywords={}, annote={} }
@book{forsythe1977computer, title={Computer methods for mathematical computations}, author={Forsythe, George E and Malcolm, Michael A and Moler, Cleve B}, year={1977}, publisher={Prentice-Hall, Englewood Cliffs, NJ, USA}, keywords={}, annote={} }
@article{fousse2007mpfr, title={{MPFR}: A multiple-precision binary floating-point library with correct rounding}, author={Fousse, Laurent and Hanrot, Guillaume and Lef{\`e}vre, Vincent and P{\'e}lissier, Patrick and Zimmermann, Paul}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={33}, number={2}, pages={13--es}, year={2007}, publisher={ACM New York, NY, USA}, keywords={}, annote={} }

% G
% ------------------------
@article{goldberg1991every, title={What every computer scientist should know about floating-point arithmetic}, author={Goldberg, David}, journal={ACM Computing Surveys (CSUR)}, volume={23}, number={1}, pages={5--48}, year={1991}, publisher={ACM New York, NY, USA} }
@book{golub1996matrix, title={Matrix computations}, author={Golub, Gene H and Van Loan, Charles F}, edition={3rd}, year={1996}, publisher={Johns Hopkins University Press}, address={Baltimore, MD}, keywords={}, annote={} }
@book{golub2013matrix, title={Matrix computations}, author={Golub, Gene H and Van Loan, Charles F}, edition={4th}, year={2013}, publisher={Johns Hopkins University Press}, keywords={}, annote={} }
% Matrix multiplication (MxM) is a cornerstone application for both high-performance computing and safety-critical applications.
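% The entry below analyzes how high-performance GEMM implementations organize the
% computation around cache-resident blocks. A minimal NumPy sketch of that blocking
% idea (an illustration of the concept only, not Goto's actual kernel):
%
%   import numpy as np
%
%   def blocked_matmul(A, B, bs=64):
%       """Compute C = A*B tile by tile, so each bs-by-bs block of A and B
%       can stay resident in cache while it is reused."""
%       m, k = A.shape
%       k2, n = B.shape
%       assert k == k2, "inner dimensions must match"
%       C = np.zeros((m, n), dtype=A.dtype)
%       for i in range(0, m, bs):          # row panels of C
%           for p in range(0, k, bs):      # the shared (inner) dimension
%               for j in range(0, n, bs):  # column panels of C
%                   C[i:i+bs, j:j+bs] += np.dot(A[i:i+bs, p:p+bs], B[p:p+bs, j:j+bs])
%       return C
%
%   A = np.random.rand(300, 200); B = np.random.rand(200, 100)
%   assert np.allclose(blocked_matmul(A, B), np.dot(A, B))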
@article{goto2008anatomy, title={Anatomy of high-performance matrix multiplication}, author={Goto, Kazushige and van de Geijn, Robert A}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={34}, number={3}, pages={1--25}, year={2008}, publisher={ACM New York, NY, USA} }
@inproceedings{gottschling2007representation, title={Representation-transparent matrix algorithms with scalable performance}, author={Gottschling, Peter and Wise, David S and Adams, Michael D}, booktitle={Proceedings of the 21st annual international conference on Supercomputing}, pages={116--125}, year={2007}, keywords={}, annote={} }
@inproceedings{graves2013speech, title={Speech recognition with deep recurrent neural networks}, author={Graves, Alex and Mohamed, Abdel-rahman and Hinton, Geoffrey}, booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, pages={6645--6649}, year={2013}, organization={IEEE} }
@inproceedings{gupta2015deep, title={Deep learning with limited numerical precision}, author={Gupta, Suyog and Agrawal, Ankur and Gopalakrishnan, Kailash and Narayanan, Pritish}, booktitle={International Conference on Machine Learning}, pages={1737--1746}, year={2015}, organization={PMLR}, keywords={}, annote={} }
@misc{gustafson2020generalized, title={A generalized framework for matching arithmetic format to application requirements}, author={Gustafson, John L}, year={2020}, keywords={}, annote={} }
@book{gustafson2017end, title={The end of error: Unum computing}, author={Gustafson, John L}, year={2017}, publisher={Chapman and Hall/CRC}, keywords={posit}, annote={Unum computing and Posit development. This book covers the development of posits. Use it as the first to cite posits.} }
@article{gustafson2017beating, title={Beating floating point at its own game: Posit arithmetic}, author={Gustafson, John L and Yonemoto, Isaac T}, journal={Supercomputing Frontiers and Innovations}, volume={4}, number={2}, pages={71--86}, year={2017}, keywords={posit arithmetic}, annote={} }
@incollection{gustavson1972some, title={Some basic techniques for solving sparse systems of linear equations}, author={Gustavson, Fred G}, booktitle={Sparse matrices and their applications}, pages={41--52}, year={1972}, publisher={Springer}, keywords={}, annote={} }
@article{granlund1996gnu, title={{GNU MP}}, author={Granlund, Torbj{\"o}rn}, journal={The GNU Multiple Precision Arithmetic Library}, volume={2}, number={2}, year={1996}, keywords={}, annote={} }

% H
% ------------------------
@article{hager1984condition, title={Condition estimates}, author={Hager, William W}, journal={SIAM Journal on Scientific and Statistical Computing}, volume={5}, number={2}, pages={311--316}, year={1984}, publisher={SIAM} }
% http://www.ii.uib.no/~lennart/drgrad/Hager1984.pdf
@inproceedings{haidar2017investigating, title={Investigating half precision arithmetic to accelerate dense linear system solvers}, author={Haidar, Azzam and Wu, Panruo and Tomov, Stanimire and Dongarra, Jack}, booktitle={Proceedings of the 8th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, pages={1--8}, year={2017}, keywords={}, annote={} }
@inproceedings{haidar2018harnessing, title={Harnessing {GPU} tensor cores for fast {FP16} arithmetic to speed up mixed-precision iterative refinement solvers}, author={Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack and Higham, Nicholas J}, booktitle={SC18: International Conference for High Performance Computing, Networking, Storage and Analysis}, pages={603--613}, year={2018},
organization={IEEE}, keywords={}, annote={} }
@inproceedings{haidar2018design, title={The design of fast and energy-efficient linear solvers: On the potential of half-precision arithmetic and iterative refinement techniques}, author={Haidar, Azzam and Abdelfattah, Ahmad and Zounon, Mawussi and Wu, Panruo and Pranesh, Srikara and Tomov, Stanimire and Dongarra, Jack}, booktitle={International Conference on Computational Science}, pages={586--600}, year={2018}, organization={Springer}, keywords={}, annote={As parallel computers approach the exascale, power efficiency in high-performance computing (HPC) systems is of increasing concern. Exploiting the hardware features and algorithms effectively achieves power efficiency and addresses the energy constraints in modern and future HPC systems. In this work, we present a novel design and implementation of an energy-efficient solution for dense linear systems of equations, which are at the heart of large-scale HPC applications. The proposed energy-efficient linear system solvers are based on two main components: (1) iterative refinement techniques and (2) reduced precision computing features in modern accelerators and co-processors. While most energy efficiency approaches aim to reduce consumption with a minimal performance penalty, our method improves performance and efficiency. Compared to highly optimized linear system solvers, our kernels are up to 2$\times$ faster to deliver the same accuracy solution and reduce the energy consumption to half on Intel KNL architectures. Using the tensor cores available in the NVIDIA V100 PCIe GPUs efficiently, the speedups can be up to 4$\times$ with more than 80\% reduction in energy consumption.} }
@book{hammer2012c++, title={C++ Toolbox for Verified Computing I: Basic Numerical Problems Theory, Algorithms, and Programs}, author={Hammer, Rolf and Hocks, Matthias and Kulisch, Ulrich and Ratz, Dietmar}, year={2012}, publisher={Springer Science \& Business Media} }
% Hannun DEEP LEARNING
@article{hannun2014deep, title={Deep speech: Scaling up end-to-end speech recognition}, author={Hannun, Awni and Case, Carl and Casper, Jared and Catanzaro, Bryan and Diamos, Greg and Elsen, Erich and Prenger, Ryan and Satheesh, Sanjeev and Sengupta, Shubho and Coates, Adam and others}, journal={arXiv preprint arXiv:1412.5567}, year={2014} }
@article{he2001using, title={Using accurate arithmetics to improve numerical reproducibility and stability in parallel applications}, author={He, Yun and Ding, Chris HQ}, journal={The Journal of Supercomputing}, volume={18}, number={3}, pages={259--277}, year={2001}, publisher={Springer}, keywords={}, annote={} }
@inproceedings{he2016deep, title={Deep residual learning for image recognition}, author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, pages={770--778}, year={2016} }
% Higham Catherine and Desmond
@article{higham2019deep, title={Deep learning: An introduction for applied mathematicians}, author={Higham, Catherine F and Higham, Desmond J}, journal={SIAM Review}, volume={61}, number={4}, pages={860--891}, year={2019}, publisher={SIAM} }
% HIGHAM
@article{higham1993accuracy, title={The accuracy of floating point summation}, author={Higham, Nicholas J}, journal={SIAM Journal on Scientific Computing}, volume={14}, number={4}, pages={783--799}, year={1993}, publisher={SIAM} }
@book{higham2002accuracy, title={Accuracy and stability of numerical algorithms}, author={Higham,
Nicholas J}, year={2002}, publisher={SIAM}, keywords={}, annote={} }
@article{higham2019squeezing, title={Squeezing a matrix into half-precision, with an application to solving linear systems}, author={Higham, Nicholas J and Pranesh, Srikara and Zounon, Mawussi}, journal={SIAM Journal on Scientific Computing}, volume={41}, number={4}, pages={A2536--A2551}, year={2019}, publisher={SIAM} }
@article{higham1997iterative, title={Iterative refinement for linear systems and {LAPACK}}, author={Higham, Nicholas J}, journal={IMA Journal of Numerical Analysis}, volume={17}, number={4}, pages={495--509}, year={1997}, publisher={Oxford University Press}, keywords={}, annote={} }
@article{higham2019new, title={A new preconditioner that exploits low-rank approximations to factorization error}, author={Higham, Nicholas J and Mary, Theo}, journal={SIAM Journal on Scientific Computing}, volume={41}, number={1}, pages={A59--A82}, year={2019}, publisher={SIAM}, keywords={}, annote={} }
@article{higham2019simulating, title={Simulating low precision floating-point arithmetic}, author={Higham, Nicholas J and Pranesh, Srikara}, journal={SIAM Journal on Scientific Computing}, volume={41}, number={5}, pages={C585--C602}, year={2019}, publisher={SIAM}, keywords={}, annote={} }
@techreport{hittinger2019variable, title={Variable Precision Computing}, author={Hittinger, JA and Lindstrom, PG and Bhatia, H and Bremer, PT and Copeland, DM and Chand, KK and Fox, AL and Lloyd, GS and Menon, H and Morrison, GD and others}, year={2019}, institution={Lawrence Livermore National Laboratory (LLNL), Livermore, CA (United States)}, keywords={}, annote={} }
@inproceedings{ho2017exploiting, title={Exploiting half precision arithmetic in {NVIDIA} {GPU}s}, author={Ho, Nhut-Minh and Wong, Weng-Fai}, booktitle={2017 IEEE High Performance Extreme Computing Conference (HPEC)}, pages={1--7}, year={2017}, organization={IEEE}, keywords={}, annote={} }
@article{hook2019max, title={Max-balanced Hungarian scalings}, author={Hook, James and Pestana, Jennifer and Tisseur, Fran{\c{c}}oise and Hogg, Jonathan}, journal={SIAM Journal on Matrix Analysis and Applications}, volume={40}, number={1}, pages={320--346}, year={2019}, publisher={SIAM} }
@inproceedings{horowitz20141, title={1.1 Computing's energy problem (and what we can do about it)}, author={Horowitz, Mark}, booktitle={2014 IEEE International Solid-State Circuits Conference Digest of Technical Papers (ISSCC)}, pages={10--14}, year={2014}, organization={IEEE}, keywords={}, annote={} }
@article{hu2021lora, title={{LoRA}: Low-rank adaptation of large language models}, author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu}, journal={arXiv preprint arXiv:2106.09685}, year={2021} }

% I
% ------------------------
@article{ieee1985standard, journal={ANSI/IEEE Std 754-1985}, title={{IEEE} Standard for Binary Floating-Point Arithmetic}, year={1985}, pages={1-20}, doi={10.1109/ieeestd.1985.82928}, keywords={}, annote={} }
@article{ieee2008standard, journal={IEEE Std 754-2008}, title={{IEEE} Standard for Floating-Point Arithmetic}, year={2008}, pages={1-70}, doi={10.1109/ieeestd.2008.4610935}, keywords={}, annote={} }
@article{ieee2019standard, journal={IEEE Std 754-2019 (Revision of IEEE 754-2008)}, title={{IEEE} Standard for Floating-Point Arithmetic}, year={2019}, pages={1-84}, doi={10.1109/ieeestd.2019.8766229}, keywords={}, annote={}
}
@misc{intel2018bfloat16, title={{BFLOAT16} - {H}ardware {N}umerics {D}efinition}, author={{Intel Corporation}}, pages={5--7}, year={2018}, publisher={Intel}, keywords={}, annote={} }
% https://www.intel.com/content/dam/develop/external/us/en/documents/bf16-hardware-numerics-definition-white-paper.pdf

% J
% ------------------------
@inproceedings{jaiswal2018architecture, title={Architecture generator for type-3 unum posit adder/subtractor}, author={Jaiswal, Manish Kumar and So, Hayden K-H}, booktitle={2018 IEEE International Symposium on Circuits and Systems (ISCAS)}, pages={1--5}, year={2018}, organization={IEEE}, keywords={}, annote={} }
@inproceedings{jaiswal2018universal, title={Universal number posit arithmetic generator on {FPGA}}, author={Jaiswal, Manish Kumar and So, Hayden K-H}, booktitle={2018 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)}, pages={1159--1162}, year={2018}, organization={IEEE}, keywords={}, annote={} }
% The posit number format includes a run-time varying exponent component, defined by a combination of the regime bits (with run-time varying length) and the exponent bits (up to ES bits, the exponent size). This also leads to a run-time variation in the size and position of the mantissa field. This run-time variation in the posit format poses a hardware design challenge. Being a recent development, posit lacks adequate hardware arithmetic architectures. Thus, this paper is aimed toward the development of posit arithmetic algorithms and their generic hardware generator, focused on basic posit arithmetic (floating-point to posit conversion, posit to floating-point conversion, addition/subtraction, and multiplication) and demonstrated on an FPGA platform. The target is to develop an open-source solution for generating basic posit arithmetic architectures with parameterized choices. This contribution would enable further exploration and evaluation of the posit system.
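% As a concrete companion to the format description above, here is a tiny pure-Python
% decoder for an n-bit posit bit pattern (sign, run-length regime, up to es exponent
% bits, fraction). This is an illustrative sketch only; real implementations, such as
% the Universal library cited in this file, also handle rounding and arithmetic.
%
%   def posit_decode(p, n=8, es=2):
%       """Decode the n-bit posit bit pattern p (given as an int) to a float."""
%       mask = (1 << n) - 1
%       p &= mask
%       if p == 0:
%           return 0.0
%       if p == 1 << (n - 1):
%           return float('nan')                  # NaR ("not a real")
%       sign = -1.0 if p & (1 << (n - 1)) else 1.0
%       if sign < 0:
%           p = (-p) & mask                      # two's complement, then decode magnitude
%       s = format(p, f'0{n}b')[1:]              # the n-1 bits after the sign bit
%       run = len(s) - len(s.lstrip(s[0]))       # regime: run of identical leading bits
%       k = run - 1 if s[0] == '1' else -run
%       rest = s[run + 1:]                       # skip the regime-terminating bit
%       e = int(rest[:es].ljust(es, '0'), 2)     # exponent bits, zero-padded
%       frac = rest[es:]
%       f = int(frac, 2) / (1 << len(frac)) if frac else 0.0
%       return sign * (2.0 ** (2 ** es)) ** k * 2.0 ** e * (1.0 + f)
%
%   posit_decode(0b01000000)   # 1.0
%   posit_decode(0b01010000)   # 4.0  (regime k=0, exponent bits '10')
%   posit_decode(0b11000000)   # -1.0 (two's complement of 0b01000000)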
@article{jankowski1977iterative, title={Iterative refinement implies numerical stability}, author={Jankowski, M and Wo{\'z}niakowski, H}, journal={BIT Numerical Mathematics}, volume={17}, pages={303--311}, year={1977}, publisher={Springer}, keywords={}, annote={} } @article{johnson2018rethinking, title={Rethinking floating point for deep learning}, author={Johnson, Jeff}, journal={arXiv preprint arXiv:1811.01721}, year={2018}, keywords={}, annote={} } @inproceedings{jouppi2017datacenter, title={In-datacenter performance analysis of a tensor processing unit}, author={Jouppi, Norman P and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and others}, booktitle={Proceedings of the 44th annual international symposium on computer architecture}, pages={1--12}, year={2017}, keywords={}, annote={} } % K % ------------------------ @article{kalamkar2019study, title={A study of {BFLOAT16} for deep learning training}, author={Kalamkar, Dhiraj and Mudigere, Dheevatsa and Mellempudi, Naveen and Das, Dipankar and Banerjee, Kunal and Avancha, Sasikanth and Vooturi, Dharma Teja and Jammalamadaka, Nataraj and Huang, Jianyu and Yuen, Hector and others}, journal={arXiv preprint arXiv:1905.12322}, year={2019} } @book{kindratenko2014numerical, title={Numerical computations with GPUs}, author={Kindratenko, Volodymyr}, year={2014}, publisher={Springer}, keywords={}, annote={} } % Klatte @book{klatte2012c, title={C-XSC: A C++ class library for extended scientific computing}, author={Klatte, Rudi and Kulisch, Ulrich and Wiethoff, Andreas and Rauch, Michael}, year={2012}, publisher={Springer Science \& Business Media}, keywords={}, annote={} } % Kharya @article{kharya2020tensorfloat, title={TensorFloat-32 in the {A100 GPU} accelerates {AI} training {HPC} up to 20x}, author={Kharya, P}, journal={NVIDIA Corporation, Tech. 
Rep}, year={2020}, keywords={}, annote={} }
% url={https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/}
@article{knight2014symmetry, title={A symmetry preserving algorithm for matrix scaling}, author={Knight, Philip A and Ruiz, Daniel and U{\c{c}}ar, Bora}, journal={SIAM Journal on Matrix Analysis and Applications}, volume={35}, number={3}, pages={931--955}, year={2014}, publisher={SIAM} }
@misc{kulisch1976grundlagen, title={Grundlagen des numerischen Rechnens---Mathematische Begr{\"u}ndung der Rechnerarithmetik}, series={Reihe Informatik, Band 19}, author={Kulisch, U}, year={1976}, publisher={Wissenschaftsverlag des Bibliographischen Instituts Mannheim} }
@book{kulisch2008computer, title={Computer arithmetic and validity: theory, implementation, and applications}, author={Kulisch, Ulrich}, year={2008}, publisher={Walter de Gruyter} }
@book{kulisch2014computer, title={Computer arithmetic in theory and practice}, author={Kulisch, Ulrich W and Miranker, Willard L}, year={2014}, publisher={Academic Press} }
@article{kulisch2011exact, title={The exact dot product as basic tool for long interval arithmetic}, author={Kulisch, Ulrich and Snyder, Van}, journal={Computing}, volume={91}, number={3}, pages={307--313}, year={2011}, publisher={Springer} }
@book{kulisch2014new, title={A new approach to scientific computation}, author={Kulisch, Ulrich W and Miranker, Willard L}, volume={7}, year={2014}, publisher={Elsevier} }
@techreport{kulisch1997fifth, title={The fifth floating-point operation for top-performance computers}, author={Kulisch, Ulrich}, institution={Universit{\"a}t Karlsruhe}, year={1997} }
@article{kulisch1986arithmetic, title={The arithmetic of the digital computer: A new approach}, author={Kulisch, Ulrich W and Miranker, Willard L}, journal={SIAM Review}, volume={28}, number={1}, pages={1--40}, year={1986}, publisher={SIAM} }
@article{kulisch2011very, title={Very fast and exact accumulation of products}, author={Kulisch, Ulrich}, journal={Computing}, volume={91}, pages={397--405}, year={2011}, publisher={Springer} }

% L
% --------------------------
@book{lambers2019explorations, title={Explorations in numerical analysis}, author={Lambers, James V and Mooney, Amber Sumner and Montiforte, Vivian A}, year={2019}, publisher={World Scientific}, keywords={}, annote={} }
@book{lambers2024enamlj, title={Explorations in numerical analysis and deep learning with Julia}, author={Lambers, James V and Mooney, Amber Sumner and Montiforte, Vivian A and Quinlan, James}, year={2024}, publisher={World Scientific}, keywords={}, annote={} }
@inproceedings{langou2006exploiting, title={Exploiting the performance of 32 bit floating point arithmetic in obtaining 64 bit accuracy (revisiting iterative refinement for linear systems)}, author={Langou, Julie and Langou, Julien and Luszczek, Piotr and Kurzak, Jakub and Buttari, Alfredo and Dongarra, Jack}, booktitle={SC'06: Proceedings of the 2006 ACM/IEEE conference on Supercomputing}, pages={50--50}, year={2006}, organization={IEEE}, keywords={}, annote={} }
@article{larsson1993scaling, title={On scaling linear programs---Some experimental results}, author={Larsson, T}, journal={Optimization}, volume={27}, number={4}, pages={355--373}, year={1993}, publisher={Taylor \& Francis} }
@inproceedings{lehoczky2018high, title={High-level {.NET} software implementations of unum type {I} and posit with simultaneous {FPGA} implementation using {Hastlayer}}, author={Leh{\'o}czky, Zolt{\'a}n and Retzler, Andr{\'a}s and T{\'o}th, Rich{\'a}rd and Szab{\'o}, {\'A}lmos and Farkas, Benedek and Somogyi, Kriszti{\'a}n}, booktitle={Proceedings of the Conference for Next Generation Arithmetic}, pages={1--7}, year={2018} }
@book{leon2020linear, title={Linear algebra with applications}, author={Leon, Steven J and De Pillis, Lisette}, year={2020}, publisher={Pearson}, keywords={}, annote={} }
@article{li2010family, title={A family of iterative methods for computing the approximate inverse of a square matrix and inner inverse of a non-square matrix}, author={Li, Weiguo and Li, Zhi}, journal={Applied Mathematics and Computation}, volume={215}, number={9}, pages={3433--3442}, year={2010}, publisher={Elsevier} }
@article{li2002design, title={Design, implementation and testing of extended and mixed precision {BLAS}}, author={Li, Xiaoye S and Demmel, James W and Bailey, David H and Henry, Greg and Hida, Yozo and Iskandar, Jimmy and Kahan, William and Kang, Suh Y and Kapur, Anil and Martin, Michael C and others}, journal={ACM Transactions on Mathematical Software (TOMS)}, volume={28}, number={2}, pages={152--205}, year={2002}, publisher={ACM New York, NY, USA} }
@inproceedings{li1998making, title={Making sparse Gaussian elimination scalable by static pivoting}, author={Li, Xiaoye S and Demmel, James W}, booktitle={SC'98: Proceedings of the 1998 ACM/IEEE Conference on Supercomputing}, pages={34--34}, year={1998}, organization={IEEE}, keywords={}, annote={} }
@article{lindquist2021accelerating, title={Accelerating restarted {GMRES} with mixed precision arithmetic}, author={Lindquist, Neil and Luszczek, Piotr and Dongarra, Jack}, journal={IEEE Transactions on Parallel and Distributed Systems}, volume={33}, number={4}, pages={1027--1037}, year={2021}, publisher={IEEE}, keywords={}, annote={} }
@inproceedings{lindquist2020improving, title={Improving the performance of the {GMRES} method using mixed-precision techniques}, author={Lindquist, Neil and Luszczek, Piotr and Dongarra, Jack}, booktitle={Driving Scientific and Engineering Discoveries Through the Convergence of HPC, Big Data and AI: 17th Smoky Mountains Computational Sciences and Engineering Conference, SMC 2020, Oak Ridge, TN, USA, August 26-28, 2020, Revised Selected Papers 17}, pages={51--66}, year={2020}, organization={Springer}, keywords={}, annote={} }
@techreport{lloyd2020zfp, title={{ZFP} Hardware Implementation}, author={Lloyd, G Scott and Lindstrom, Peter G}, year={2020}, institution={Lawrence Livermore National Laboratory (LLNL), Livermore, CA (United States)}, keywords={}, annote={} }
@article{lu2020evaluations, title={Evaluations on deep neural networks training using posit number system}, author={Lu, Jinming and Fang, Chao and Xu, Mingyang and Lin, Jun and Wang, Zhongfeng}, journal={IEEE Transactions on Computers}, volume={70}, number={2}, pages={174--187}, year={2020}, publisher={IEEE}, keywords={}, annote={} }
@article{liu2020new, title={A new iterative refinement for ill-conditioned linear systems based on discrete gradient}, author={Liu, Kai and Yang, Jie and Liu, Changying}, journal={Japan Journal of Industrial and Applied Mathematics}, volume={37}, pages={803--818}, year={2020}, publisher={Springer} }

% M
% --------------------------
@incollection{ma2015solving, title={Solving multiscale linear programs using the simplex method in quadruple precision}, author={Ma, Ding and Saunders, Michael A},
booktitle={Numerical analysis and optimization}, pages={223--235}, year={2015}, publisher={Springer}, keywords={}, annote={} }
@article{ma2017reliable, title={Reliable and efficient solution of genome-scale models of Metabolism and macromolecular Expression}, author={Ma, Ding and Yang, Laurence and Fleming, Ronan MT and Thiele, Ines and Palsson, Bernhard O and Saunders, Michael A}, journal={Scientific Reports}, volume={7}, number={1}, pages={1--11}, year={2017}, publisher={Nature Publishing Group}, keywords={}, annote={} }
@misc{maddock2018boost, title={Boost multiprecision}, author={Maddock, John and Kormanyos, Christopher and others}, year={2018}, keywords={}, annote={} }
@inproceedings{McCleeary2019LazyER, title={Lazy exact real arithmetic using floating point operations}, author={McCleeary, Ryan}, year={2019}, keywords={}, annote={} }
@article{mallasen2022percival, title={{PERCIVAL}: {O}pen-source posit {RISC-V} core with quire capability}, author={Mallas{\'e}n, David and Murillo, Raul and Del Barrio, Alberto A and Botella, Guillermo and Pi{\~n}uel, Luis and Prieto-Matias, Manuel}, journal={IEEE Transactions on Emerging Topics in Computing}, volume={10}, number={3}, pages={1241--1252}, year={2022}, publisher={IEEE}, keywords={}, annote = {PERCIVAL is an application-level posit RISC-V core based on CVA6 that can execute all posit instructions, including the quire fused operations. This solves the obstacle encountered by previous works, which only included partial posit support or had to emulate posits in software. In addition, Xposit, a RISC-V extension for posit instructions, is incorporated into LLVM. Therefore, PERCIVAL is the first work that integrates the complete posit instruction set in the hardware.} }
@article{mckeeman1962algorithm, title={Algorithm 135: Crout with equilibration and iteration}, author={McKeeman, William Marshall}, journal={Communications of the ACM}, volume={5}, number={11}, pages={553--555}, year={1962}, publisher={ACM New York, NY, USA} }
@article{mellempudi2019mixed, title={Mixed precision training with 8-bit floating point}, author={Mellempudi, Naveen and Srinivasan, Sudarshan and Das, Dipankar and Kaul, Bharat}, journal={arXiv preprint arXiv:1905.12334}, year={2019}, keywords={mixed-precision, 8-bit, deep learning}, annote={}, abstract={Reduced precision computation for deep neural networks is one of the key areas addressing the widening 'compute gap' driven by an exponential growth in model size. Deep learning training has largely migrated to 16-bit precision, with significant performance and energy efficiency gains in recent years. However, attempts to train DNNs at 8-bit precision have met significant challenges because of back-propagation's higher precision and dynamic range requirements. This paper proposes a method to train deep neural networks using 8-bit floating-point representation for weights, activations, errors, and gradients. In addition to reducing compute precision, we reduced the precision requirements for the master copy of weights from 32-bit to 16-bit. We demonstrate state-of-the-art accuracy across multiple data sets (imagenet-1K, WMT16) and a broader set of workloads (Resnet-18/34/50, GNMT, Transformer) than previously reported. We propose an enhanced loss scaling method to augment the reduced subnormal range of 8-bit floating point for improved error propagation. We also examine the impact of quantization noise on generalization and propose a stochastic rounding technique to address gradient noise.
As a result of applying all these techniques, we report slightly higher validation accuracy than the full precision baseline.} } @article{micikevicius2022fp8, title={FP8 formats for deep learning}, author={Micikevicius, Paulius and Stosic, Dusan and Burgess, Neil and Cornea, Marius and Dubey, Pradeep and Grisenthwaite, Richard and Ha, Sangwon and Heinecke, Alexander and Judd, Patrick and Kamalu, John and others}, journal={arXiv preprint arXiv:2209.05433}, year={2022}, keywords={}, annote={} } @article{misback2023odyssey, title={Odyssey: An Interactive Workbench for Expert-Driven Floating-Point Expression Rewriting}, author={Misback, Edward and Chan, Caleb and Saiki, Brett and Jun, Eunice and Tatlock, Zachary and Panchekha, Pavel}, journal={arXiv preprint arXiv:2305.10599}, year={2023}, keywords={}, annote={} } @article{mittal2002efficient, title={Efficient solution of a sparse non-symmetric system of linear equations}, author={Mittal, RC and Al-Kurdi, Ahmad}, journal={International journal of computer mathematics}, volume={79}, number={4}, pages={449--463}, year={2002}, publisher={Taylor \& Francis}, keywords={}, annote={} } @article{mittal2002lu, title={{LU}-decomposition and numerical structure for solving large sparse nonsymmetric linear systems}, author={Mittal, RC and Al-Kurdi, A}, journal={Computers \& Mathematics with Applications}, volume={43}, number={1-2}, pages={131--155}, year={2002}, publisher={Elsevier}, keywords={}, annote={} } @article{moler1967iterative, title={Iterative refinement in floating point}, author={Moler, Cleve B}, journal={Journal of the ACM (JACM)}, volume={14}, number={2}, pages={316--321}, year={1967}, publisher={ACM New York, NY, USA}, keywords={}, annote={} } @article{molisch2017hybrid, title={Hybrid beamforming for massive {MIMO}: A survey}, author={Molisch, Andreas F and Ratnam, Vishnu V and Han, Shengqian and Li, Zheda and Nguyen, Sinh Le Hong and Li, Linsheng and Haneda, Katsuyuki}, journal={IEEE Communications magazine}, volume={55}, number={9}, pages={134--141}, year={2017}, publisher={IEEE}, keywords={}, annote={} } @article{montero2019template, title={Template-based posit multiplication for training and inferring in neural networks}, author={Montero, Ra{\'u}l Murillo and Del Barrio, Alberto A and Botella, Guillermo}, journal={arXiv preprint arXiv:1907.04091}, year={2019}, keywords={}, annote={training is done with posit format, achieving promising results for a binary classification problem even with reduced posit configurations. 
In the inference stage, 8-bit posits are as good as floating point on the MNIST dataset but lose some accuracy on CIFAR-10.} }
@article{muhammad2020deep, title={Deep learning for safe autonomous driving: Current challenges and future directions}, author={Muhammad, Khan and Ullah, Amin and Lloret, Jaime and Del Ser, Javier and de Albuquerque, Victor Hugo C}, journal={IEEE Transactions on Intelligent Transportation Systems}, volume={22}, number={7}, pages={4316--4336}, year={2020}, publisher={IEEE}, keywords={}, annote={} }
% https://ieeexplore.ieee.org/abstract/document/9446981
% https://ieeexplore.ieee.org/abstract/document/9859379
@article{murillo2021plam, title={{PLAM}: A posit logarithm-approximate multiplier}, author={Murillo, Raul and Del Barrio, Alberto A and Botella, Guillermo and Kim, Min Soo and Kim, HyunJin and Bagherzadeh, Nader}, journal={IEEE Transactions on Emerging Topics in Computing}, volume={10}, number={4}, pages={2079--2085}, year={2021}, publisher={IEEE}, keywords={}, annote={} }
@article{murillo2020deep, title={Deep {PeNSieve}: A deep learning framework based on the posit number system}, author={Murillo, Raul and Del Barrio, Alberto A and Botella, Guillermo}, journal={Digital Signal Processing}, volume={102}, pages={102762}, year={2020}, publisher={Elsevier}, keywords={}, annote={} }

% N
% --------------------------
@article{noune2022, title={8-bit Numerical Formats for Deep Neural Networks}, author={Noune, Badreddine and Jones, Philip and Justus, Daniel and Masters, Dominic and Luschi, Carlo}, journal={arXiv preprint arXiv:2206.02915}, year={2022}, keywords={8-bit, deep learning}, annote={To improve the computational efficiency of model training and inference, studies the use of 8-bit floating-point formats for activations, weights, and gradients, exploring the effect of different bit-widths for exponents and significands and of different exponent biases.} }

% O
% --------------------------
@article{ogita2005accurate, title={Accurate sum and dot product}, author={Ogita, Takeshi and Rump, Siegfried M and Oishi, Shin'ichi}, journal={SIAM Journal on Scientific Computing}, volume={26}, number={6}, pages={1955--1988}, year={2005}, publisher={SIAM}, annote={}, keywords={reliable computing} }
@article{oishi2009iterative, title={Iterative refinement for ill-conditioned linear systems}, author={Oishi, Shin'ichi and Ogita, Takeshi and Rump, Siegfried M}, journal={Japan Journal of Industrial and Applied Mathematics}, volume={26}, number={2-3}, pages={465--476}, year={2009}, publisher={Japan Society for Industrial and Applied Mathematics}, keywords={}, annote={} }
@article{oktay2022multistage, title={Multistage mixed precision iterative refinement}, author={Oktay, Eda and Carson, Erin}, journal={Numerical Linear Algebra with Applications}, volume={29}, number={4}, pages={e2434}, year={2022}, publisher={Wiley Online Library}, keywords={}, annote={} }
@article{Omtzigt2023, author = {E. Theodore L. Omtzigt and James Quinlan}, title = {Universal Numbers Library: Multi-format Variable Precision Arithmetic Library}, year = {2023}, journal = {Journal of Open Source Software}, volume = {8}, number = {83}, publisher = {The Open Journal}, pages = {5072}, doi = {10.21105/joss.05072}, keywords={}, annote={} }
% Omtzigt et al., (2023). Universal Numbers Library: Multi-format Variable Precision Arithmetic Library.
% Journal of Open Source Software, 8(83), 5072, https://doi.org/10.21105/joss.05072
% See also: https://zenodo.org/record/7735084#.ZBEUnezML0o
@article{omtzigt2020universal, title={Universal Numbers Library: design and implementation of a high-performance reproducible number systems library}, author={Omtzigt, E Theodore L and Gottschling, Peter and Seligman, Mark and Zorn, William}, journal={arXiv preprint arXiv:2012.11011}, year={2020}, keywords={}, annote={} }
@inproceedings{omtzigt2022universal, title={Universal: Reliable, Reproducible, and Energy-Efficient Numerics}, author={Omtzigt, E Theodore L and Quinlan, James}, booktitle={Conference on Next Generation Arithmetic}, pages={100--116}, year={2022}, organization={Springer}, keywords={}, annote={} }
@article{ootomo2022recovering, title={Recovering single precision accuracy from Tensor Cores while surpassing the {FP32} theoretical peak performance}, author={Ootomo, Hiroyuki and Yokota, Rio}, journal={The International Journal of High Performance Computing Applications}, volume={36}, number={4}, pages={475--491}, year={2022}, publisher={SAGE Publications Sage UK: London, England} }
@article{ozaki2012error, title={Error-free transformations of matrix multiplication by using fast routines of matrix multiplication and its applications}, author={Ozaki, Katsuhisa and Ogita, Takeshi and Oishi, Shin'ichi and Rump, Siegfried M}, journal={Numerical Algorithms}, volume={59}, pages={95--118}, year={2012}, publisher={Springer} }

% P
% --------------------------
@article{palmer2014more, title={More reliable forecasts with less precise computations: a fast-track route to cloud-resolved weather and climate simulators?}, author={Palmer, Tim N}, journal={Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, volume={372}, number={2018}, pages={20130391}, year={2014}, publisher={The Royal Society Publishing}, keywords={}, annote={} }
@article{pan1991improved, title={An improved Newton iteration for the generalized inverse of a matrix, with applications}, author={Pan, Victor and Schreiber, Robert}, journal={SIAM Journal on Scientific and Statistical Computing}, volume={12}, number={5}, pages={1109--1130}, year={1991}, publisher={SIAM} }
% posit standard
% Posit Working Group, "Posit Standard Documentation Release 4.12-draft," Standard Posit Arithmetic, Jul. 2021.
% [Online]. Available: https://posithub.org/posit_standard4.12.pdf
@techreport{posits2022, author = {{P}osit {W}orking {G}roup}, title = {Standard for Posit Arithmetic}, institution = {National Supercomputing Centre (NSCC) Singapore}, year = {2022}, annote = {Posit Standard 2022}, keywords={} }
@misc{positviz, author = {Kirdani-Ryan, Mara and Lim, Katie and Smith, Gus and Petrisko, Dan}, title = {Well Rounded: Visualizing Floating Point Representations}, month = {June}, year = {2019}, note = {Accessed: 2023-10-08}, url = {https://cse512-19s.github.io/FP-Well-Rounded}, annote = {Posit Visualization} }
% howpublished = "\url{https://cse512-19s.github.io/FP-Well-Rounded}",
@techreport{priest1991algorithms, title={Algorithms for arbitrary precision floating point arithmetic}, author={Priest, Douglas M}, institution={University of California, Berkeley}, year={1991}, keywords={}, annote={} }

% R
% --------------------------
@article{romanov2021analysis, title={Analysis of Posit and Bfloat arithmetic of real numbers for machine learning}, author={Romanov, Aleksandr Yu and Stempkovsky, Alexander L and Lariushkin, Ilia V and Novoselov, Georgy E and Solovyev, Roman A and Starykh, Vladimir A and Romanova, Irina I and Telpukhov, Dmitry V and Mkrtchan, Ilya A}, journal={IEEE Access}, volume={9}, pages={82318--82324}, year={2021}, publisher={IEEE} }
@techreport{ruiz2001scaling, title={A scaling algorithm to equilibrate both rows and columns norms in matrices}, author={Ruiz, Daniel}, year={2001}, institution={Rutherford Appleton Laboratory}, number={RAL-TR-2001-034} }
% Rump
@incollection{rump1999intlab, title={{INTLAB}---interval laboratory}, author={Rump, Siegfried M}, booktitle={Developments in reliable computing}, pages={77--104}, year={1999}, publisher={Springer}, keywords={}, annote={} }
@article{rump2009inversion, title={Inversion of extremely ill-conditioned matrices in floating-point}, author={Rump, Siegfried M}, journal={Japan Journal of Industrial and Applied Mathematics}, volume={26}, number={2}, pages={249--277}, year={2009}, publisher={Springer}, keywords={}, annote={} }
@article{rump1991class, title={A class of arbitrarily ill-conditioned floating-point matrices}, author={Rump, Siegfried M}, journal={SIAM Journal on Matrix Analysis and Applications}, volume={12}, number={4}, pages={645--653}, year={1991}, publisher={SIAM}, keywords={}, annote={} }
@techreport{rump1990approximate, title={Approximate inverses of almost singular matrices still contain useful information}, author={Rump, Siegfried M}, year={1990}, institution={Techn. Univ.
@article{rump2020faithfully,
  title={Faithfully Rounded Floating-Point Computation},
  author={Rump, Siegfried M and Lange, Marko},
  journal={ACM Transactions on Mathematical Software},
  volume={46},
  number={3},
  pages={1--20},
  year={2020},
  keywords={},
  annote={}
}

% S
% --------------------------
@book{saad2003iterative,
  title={Iterative methods for sparse linear systems},
  author={Saad, Yousef},
  year={2003},
  publisher={SIAM},
  keywords={iterative methods, sparse},
  annote={}
}

@article{saad1986gmres,
  title={GMRES: A generalized minimal residual algorithm for solving nonsymmetric linear systems},
  author={Saad, Youcef and Schultz, Martin H},
  journal={SIAM Journal on Scientific and Statistical Computing},
  volume={7},
  number={3},
  pages={856--869},
  year={1986},
  publisher={SIAM}
}

@inproceedings{saleh2008floating,
  title={A floating-point fused dot-product unit},
  author={Saleh, Hani H and Swartzlander, Earl E},
  booktitle={2008 IEEE International Conference on Computer Design},
  pages={427--431},
  year={2008},
  organization={IEEE},
  keywords={},
  annote={}
}

@article{san2021low,
  title={Low precision matrix multiplication for efficient deep learning in NVIDIA carmel processors},
  author={San Juan, Pablo and Rodr{\'\i}guez-S{\'a}nchez, Rafael and Igual, Francisco D and Alonso-Jord{\'a}, Pedro and Quintana-Ort{\'\i}, Enrique S},
  journal={The Journal of Supercomputing},
  volume={77},
  pages={11257--11269},
  year={2021},
  publisher={Springer}
}

@article{sarra2011radial,
  title={Radial basis function approximation methods with extended precision floating point arithmetic},
  author={Sarra, Scott A},
  journal={Engineering Analysis with Boundary Elements},
  volume={35},
  number={1},
  pages={68--76},
  year={2011},
  publisher={Elsevier},
  keywords={},
  annote={}
}

@inproceedings{siek1998mtl,
  title={The matrix template library: A generic programming approach to high performance numerical linear algebra},
  author={Siek, Jeremy G and Lumsdaine, Andrew},
  booktitle={International Symposium on Computing in Object-Oriented Parallel Environments},
  pages={59--70},
  year={1998},
  organization={Springer},
  keywords={},
  annote={}
}

@inproceedings{siek1998matrix,
  title={The matrix template library: A unifying framework for numerical linear algebra},
  author={Siek, Jeremy G and Lumsdaine, Andrew},
  booktitle={European Conference on Object-Oriented Programming},
  pages={466--467},
  year={1998},
  organization={Springer},
  keywords={},
  annote={}
}

@article{skeel1980iterative,
  title={Iterative refinement implies numerical stability for Gaussian elimination},
  author={Skeel, Robert D},
  journal={Mathematics of Computation},
  volume={35},
  number={151},
  pages={817--832},
  year={1980},
  keywords={},
  annote={}
}

@article{smoktunowicz2015numerical,
  title={Numerical stability of iterative refinement with a relaxation for linear systems},
  author={Smoktunowicz, Alicja and Kierzkowski, Jakub and Wrobel, Iwona},
  journal={arXiv preprint arXiv:1512.04246},
  year={2015},
  keywords={},
  annote={}
}

@misc{stillwater2017universal,
  title={Universal Number Library},
  url={https://github.com/stillwater-sc/universal},
  year={2017},
  organization={Stillwater Supercomputing, Inc.},
  keywords={},
  annote={}
}

@inproceedings{svyatkovskiy2017training,
  title={Training distributed deep recurrent neural networks with mixed precision on {GPU} clusters},
  author={Svyatkovskiy, Alexey and Kates-Harbeck, Julian and Tang, William},
  booktitle={Proceedings of the Machine Learning on HPC Environments},
  pages={1--8},
  year={2017},
  publisher={ACM},
  keywords={},
  annote={}
}
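
% Several entries here (skeel1980iterative, smoktunowicz2015numerical, and
% Wilkinson's books below) concern iterative refinement, the theme of this
% file. A minimal mixed-precision sketch in NumPy/SciPy, assuming a
% well-conditioned, well-scaled system; illustrative, not a reference
% implementation:
%
%   import numpy as np
%   from scipy.linalg import lu_factor, lu_solve
%
%   def mixed_precision_refine(A, b, iters=10):
%       lu, piv = lu_factor(A.astype(np.float32))        # factor once, low precision
%       x = lu_solve((lu, piv), b.astype(np.float32)).astype(np.float64)
%       for _ in range(iters):
%           r = b - A @ x                                # residual in high precision
%           d = lu_solve((lu, piv), r.astype(np.float32))
%           x = x + d.astype(np.float64)                 # correction in high precision
%       return x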
@article{suraksha2024silicon,
  author = {Suraksha, P},
  title = {Bengaluru-based Calligo Tech to receive first silicon with posit computing capability this month},
  journal = {The Economic Times},
  year = {2024},
  url = {https://economictimes.indiatimes.com/tech/startups/bengaluru-based-calligo-tech-to-receive-first-silicon-with-posit-computing-capability-this-month/articleshow/107406474.cms},
  note = {Accessed: 2024-02-05}
}

% T
% --------------------------
@article{tomov2010towards,
  title={Towards dense linear algebra for hybrid GPU accelerated manycore systems},
  author={Tomov, Stanimire and Dongarra, Jack and Baboulin, Marc},
  journal={Parallel Computing},
  volume={36},
  number={5-6},
  pages={232--240},
  year={2010},
  publisher={Elsevier}
}

@book{trefethen2022numerical,
  title={Numerical linear algebra},
  author={Trefethen, Lloyd N and Bau, David},
  volume={181},
  year={2022},
  publisher={SIAM}
}
% p. 243, "Why Iterate?": table of representative dense matrix sizes over the years:
%   1950: n = 20     (Wilkinson)
%   1965: n = 200    (Forsythe and Moler)
%   1980: n = 2000   (LINPACK)
%   1995: n = 20000  (LAPACK)

% U
% --------------------------
@techreport{uccar2011symmetry,
  title={A Symmetry Preserving Algorithm for Matrix Scaling},
  author={Knight, Philip A and Ruiz, Daniel and U{\c{c}}ar, Bora},
  institution={INRIA},
  number={RR-7552},
  year={2011}
}

@inproceedings{uguen2019hardware,
  author={Uguen, Yohann and Forget, Luc and de Dinechin, Florent},
  booktitle={2019 29th International Conference on Field Programmable Logic and Applications (FPL)},
  title={Evaluating the Hardware Cost of the Posit Number System},
  year={2019},
  pages={106--113},
  doi={10.1109/FPL.2019.00026},
  abstract = {The posit number system is proposed to replace IEEE floating-point numbers. It is a floating-point system that trades exponent bits for significand bits, depending on the magnitude of the numbers. Thus, it provides more precision for numbers around one at the expense of lower precision for very large or very small numbers. Several works have demonstrated that this trade-off can improve the accuracy of applications. However, the variable-length exponent and significand encoding impact the hardware cost of posit arithmetic. The objective of the present work is to enable application-level evaluations of the posit system that include performance and resource consumption. This article introduces an open-source hardware implementation of the posit number system in the form of a C++ templatized library compatible with Vivado HLS. This library currently implements addition, subtraction, and multiplication for custom-size posits. In addition, the posit standard also mandates the presence of the ``quire,'' a large accumulator able to perform exact sums of products. The proposed library includes the first open-source parameterized hardware quire. This library improves state-of-the-art posit implementations regarding latency and resource consumption. Still, standard 32-bit posit adders and multipliers are found to be much larger and slower than the corresponding floating-point operators. The cost of the posit 32 quire is comparable to that of a Kulisch accumulator for 32-bit floating-point.}
}
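
% The "quire" in uguen2019hardware is a wide accumulator that sums products
% exactly and rounds once at the end. Python's math.fsum returns a correctly
% rounded sum of floats, so it can mimic that round-once behavior (loosely:
% unlike a true quire, the individual products below are still rounded):
%
%   import math
%
%   def quire_like_dot(xs, ys):
%       # One rounding at the end instead of one per addition.
%       return math.fsum(x * y for x, y in zip(xs, ys))
%
%   xs, ys = [1e16, 1.0, -1e16], [1.0, 1.0, 1.0]
%   print(quire_like_dot(xs, ys))                  # 1.0 (correct)
%   print(sum(x * y for x, y in zip(xs, ys)))      # 0.0: per-step rounding loses the 1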
% V
% --------------------------
@incollection{vestias2020processing,
  title={Processing systems for deep learning inference on edge devices},
  author={V{\'e}stias, M{\'a}rio},
  booktitle={Convergence of Artificial Intelligence and the Internet of Things},
  pages={213--240},
  year={2020},
  publisher={Springer},
  keywords={},
  annote={}
}

% W
% --------------------------
@misc{wang2019bfloat16,
  title={BFloat16: The secret to high performance on Cloud {TPU}s},
  author={Wang, Shibo and Kanwar, Pankaj},
  howpublished={Google Cloud Blog},
  year={2019},
  keywords={},
  annote={}
}

@book{wilkinson1965rounding,
  title={Rounding Errors in Algebraic Processes},
  author={Wilkinson, J H},
  publisher={Prentice-Hall},
  address={Englewood Cliffs, NJ, USA},
  year={1965},
  annote={Introduced the iterative refinement method. First published in 1963; see jwilkinson1963rounding.}
}

@book{jwilkinson1994rounding,
  author = {Wilkinson, James Hardy},
  title = {Rounding Errors in Algebraic Processes},
  publisher = {Dover Publications, Inc.},
  year = {1994},
  keywords = {analysis computation error mathematics method numerical},
  annote = {Introduces the topic of error analysis in numerical computation. Topics include the fundamental arithmetic operations and computations involving polynomials and matrices. The precision and accuracy of solution methods are examined. Originally published by Prentice-Hall Inc., Englewood Cliffs, 1963.}
}

@book{jwilkinson1963rounding,
  author = {Wilkinson, James Hardy},
  title = {Rounding Errors in Algebraic Processes},
  publisher = {Prentice-Hall Inc.},
  year = {1963},
  keywords = {analysis computation error mathematics method numerical},
  annote = {Introduces the topic of error analysis in numerical computation. Topics include the fundamental arithmetic operations and computations involving polynomials and matrices. The precision and accuracy of solution methods are examined.}
}
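
% bfloat16 (wang2019bfloat16) keeps float32's 8 exponent bits but only 7
% fraction bits, so a float32 converts by dropping its low 16 bits. A NumPy
% sketch of the simple truncating conversion (hardware typically rounds to
% nearest-even instead; the function name is ours, for illustration):
%
%   import numpy as np
%
%   def to_bfloat16_value(x):
%       # Reinterpret the float32 bit pattern and zero the low 16 bits.
%       u = np.float32(x).view(np.uint32)
%       return (u & np.uint32(0xFFFF0000)).view(np.float32)
%
%   print(to_bfloat16_value(3.14159265))   # 3.140625: only 7 fraction bits survive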
@book{wilkinson1971algebraic,
  title={Handbook for Automatic Computation, Volume II: Linear Algebra},
  author={Wilkinson, James H and Reinsch, Christian},
  year={1971},
  publisher={Springer-Verlag New York},
  keywords={},
  annote={Companion volume to Wilkinson's The Algebraic Eigenvalue Problem (Clarendon Press, 1965).}
}

@unpublished{wilkinson1977use,
  title={The use of single-precision residuals in the solution of linear systems},
  author={Wilkinson, J H},
  note={Unpublished manuscript, National Physical Laboratory},
  year={1977},
  keywords={},
  annote={}
}

@article{wu2021low,
  title={Low-precision floating-point arithmetic for high-performance FPGA-based CNN acceleration},
  author={Wu, Chen and Wang, Mingyu and Chu, Xinyuan and Wang, Kun and He, Lei},
  journal={ACM Transactions on Reconfigurable Technology and Systems (TRETS)},
  volume={15},
  number={1},
  pages={1--21},
  year={2021},
  publisher={ACM New York, NY}
}

% X
% --------------------------
@article{xu2003gmres,
  title={A Convergent Restarted {GMRES} Method For Large Linear Systems},
  author={Xu, Minghua and Zhao, Jinxi and Wu, Jiancheng and Fan, Hongjun},
  year={2003}
}

% Y
% --------------------------
@inproceedings{yamazaki2022high,
  title={High-Performance GMRES Multi-Precision Benchmark: Design, Performance, and Challenges},
  author={Yamazaki, Ichitaro and Glusa, Christian and Loe, Jennifer and Luszczek, Piotr and Rajamanickam, Sivasankaran and Dongarra, Jack},
  booktitle={2022 IEEE/ACM International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
  pages={112--122},
  year={2022},
  organization={IEEE},
  keywords={},
  annote={}
}

% Z
% --------------------------
@article{zhang2016matrix,
  title={Matrix Depot: an extensible test matrix collection for Julia},
  author={Zhang, Weijian and Higham, Nicholas J},
  journal={PeerJ Computer Science},
  volume={2},
  pages={e58},
  year={2016},
  publisher={PeerJ Inc.},
  keywords={},
  annote={}
}

@article{zhang2023revisiting,
  title={Revisiting Block-based Quantisation: What is Important for Sub-8-bit LLM Inference?},
  author={Zhang, Cheng and Cheng, Jianyi and Shumailov, Ilia and Constantinides, George A and Zhao, Yiren},
  journal={arXiv preprint arXiv:2310.05079},
  year={2023},
  keywords={8-bit LLM},
  annote={The inference of Large language models (LLMs) requires immense computation and memory resources. Quantization has emerged as a promising solution to curtail these costs, but existing LLM quantization mainly focuses on 8-bit. In this work, we explore the statistical and learning properties of the LLM layer and attribute the bottleneck of LLM quantization to numerical scaling offsets. To address this, we adapt block quantizations for LLMs, a family of methods that share scaling factors across packed numbers. Block quantizations efficiently reduce the numerical scaling offsets solely from an arithmetic perspective without additional treatments in the computational path. Our nearly lossless quantized 6-bit LLMs achieve a 19× higher arithmetic density and 5× memory density than the float32 baseline, surpassing the prior art 8-bit quantization by 2.5× in arithmetic density and 1.2× in memory density without requiring any data calibration or re-training. We also share our insights into sub-8-bit LLM quantization, including the mismatch between activation and weight distributions, optimal fine-tuning strategies, and a lower quantization granularity inherent in the statistical properties of LLMs. The latter two tricks enable nearly lossless 4-bit LLMs on downstream tasks. The proposed framework will be open-sourced upon publication.}
}
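
% Block quantisation (zhang2023revisiting) shares one scaling factor across
% each block of packed numbers. A minimal NumPy sketch of symmetric per-block
% int8 quantisation; helper names are ours, and the input length is assumed
% to be a multiple of the block size:
%
%   import numpy as np
%
%   def block_quantise(x, block=64, bits=8):
%       xb = x.reshape(-1, block)
%       qmax = 2 ** (bits - 1) - 1                      # 127 for int8
%       scale = np.abs(xb).max(axis=1, keepdims=True) / qmax
%       scale = np.where(scale == 0, 1.0, scale)        # guard all-zero blocks
%       q = np.round(xb / scale).astype(np.int8)        # one scale per block
%       return q, scale
%
%   def block_dequantise(q, scale):
%       return (q.astype(np.float32) * scale).ravel()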
% See also: https://link.springer.com/article/10.1007/s11554-020-00984-x
@article{zhao2022numerical,
  title={Numerical Investigation into the Mixed Precision {GMRES(m)} Method Using {FP64} and {FP32}},
  author={Zhao, Yingqi and Fukaya, Takeshi and Zhang, Linjie and Iwashita, Takeshi},
  journal={Journal of Information Processing},
  volume={30},
  pages={525--537},
  year={2022},
  publisher={Information Processing Society of Japan},
  keywords={},
  annote={}
}

@article{zlatev1982use,
  title={Use of iterative refinement in the solution of sparse linear systems},
  author={Zlatev, Zahari},
  journal={SIAM Journal on Numerical Analysis},
  volume={19},
  number={2},
  pages={381--399},
  year={1982},
  publisher={SIAM},
  keywords={},
  annote={}
}

@article{zuras2008ieee,
  title={{IEEE} Standard for Floating-Point Arithmetic},
  author={Zuras, Dan and Cowlishaw, Mike and Aiken, Alex and Applegate, Matthew and Bailey, David and Bass, Steve and Bhandarkar, Dileep and Bhat, Mahesh and Bindel, David and Boldo, Sylvie and others},
  journal={IEEE Std},
  volume={754},
  number={2008},
  pages={1--70},
  year={2008},
  keywords={},
  annote={}
}